From ae4f8182c5b8849e2a31c3acd6f3b849aa9ef4b5 Mon Sep 17 00:00:00 2001
From: Jeff Pflueger
Date: Mon, 17 Apr 2017 14:54:23 -0400
Subject: [PATCH] Did toolbox, wrote writeup

---
 learning_curve.py | 15 ++++++++++++---
 questions.txt     | 17 +++++++++++++++++
 runner.py         | 12 ++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 questions.txt
 create mode 100644 runner.py

diff --git a/learning_curve.py b/learning_curve.py
index 2baa81b..7eb4317 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -21,7 +21,7 @@ def display_digits():
 
 def train_model():
     """Train a model on pictures of digits.
-    
+
     Read in 8x8 pictures of numbers and evaluate the accuracy of the model
     when different percentages of the data are used as training data. This
     function plots the average accuracy of the model as a function of the percent
@@ -32,6 +32,7 @@ def train_model():
 
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
+
     # train models with training percentages between 5 and 90 (see
     # train_percentages) and evaluate the resultant accuracy for each.
     # You should repeat each training percentage num_trials times to smooth out
@@ -39,7 +40,15 @@ def train_model():
     # For consistency with the previous example use
     # model = LogisticRegression(C=10**-10) for your learner
 
-    # TODO: your code here
+    for i, q in enumerate(train_percentages):
+        summation = 0
+        for j in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
+                                                                train_size=q / 100.0)
+            model = LogisticRegression(C=10**-17)  # deviates from the suggested 10**-10; see questions.txt
+            model.fit(X_train, y_train)
+            summation += model.score(X_test, y_test)  # accumulate test accuracy
+        test_accuracies[i] = summation / num_trials  # average over num_trials runs
 
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
@@ -51,4 +60,4 @@
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
     display_digits()
-    # train_model()
+    train_model()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..9ae90ee
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,17 @@
+I learned a few things from this toolbox.
+There is a positive correlation between the size of the training set
+and the accuracy of the model. In addition, as you increase the number
+of trials, the curve begins to smooth out. I found that using around
+50 trials smooths out the curve almost completely. The C value for the
+logistic regression also plays a role in the accuracy of the model. As
+you decrease the magnitude of the power of ten, the model gets more
+accurate. As you approach a certain value, however, there is a drop-off
+in the accuracy of the model. It drops almost to zero. I found this out
+at C = 10^-20. I think this is because the model cannot fit the data
+well with that much regularization at certain training set sizes.
+
+I would have loved some more background on exactly what scikit-learn is
+doing here. I know it is important to know the implementation, but it
+feels very much like an unknowable black box. I know it is essentially
+creating an equation based on the training set to evaluate other images,
+but how it does this is a mystery to me, and I feel like it is something worth learning.
diff --git a/runner.py b/runner.py
new file mode 100644
index 0000000..2c1fccb
--- /dev/null
+++ b/runner.py
@@ -0,0 +1,12 @@
+from sklearn.datasets import load_digits
+from sklearn.cross_validation import train_test_split
+from sklearn.linear_model import LogisticRegression
+
+data = load_digits()  # 8x8 images of handwritten digits
+X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
+                                                    train_size=0.5)
+
+model = LogisticRegression(C=10**-10)  # heavily regularized learner
+model.fit(X_train, y_train)
+print("Train accuracy %f" % model.score(X_train, y_train))
+print("Test accuracy %f" % model.score(X_test, y_test))
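
A sketch of how one could check the C drop-off described in questions.txt by
sweeping the exponent directly. This is not part of the patch, and it assumes
scikit-learn >= 0.18, where train_test_split lives in sklearn.model_selection
(the sklearn.cross_validation path used above was deprecated in 0.18 and has
since been removed).

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    train_size=0.5)

# Sweep C from 10^-20 up to 10^-6 and watch where accuracy collapses.
for exponent in range(-20, -4, 2):
    model = LogisticRegression(C=10**exponent)
    model.fit(X_train, y_train)
    print("C = 10^%d -> test accuracy %.3f"
          % (exponent, model.score(X_test, y_test)))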
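
On the black-box question in questions.txt: the "equation" a fitted
LogisticRegression learns is just a weight vector and an intercept per digit
class, and predict() picks the class with the highest linear score. A sketch
of reproducing the prediction by hand, under the same scikit-learn >= 0.18
assumption as above:

import numpy
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    train_size=0.5)

model = LogisticRegression(C=10**-10)
model.fit(X_train, y_train)

# One row of weights per class; score every test image against each class.
scores = X_test.dot(model.coef_.T) + model.intercept_
by_hand = model.classes_[numpy.argmax(scores, axis=1)]
print("matches model.predict():",
      numpy.array_equal(by_hand, model.predict(X_test)))

If a tiny C squashes the weights toward zero, the scores barely separate the
classes, which lines up with the accuracy collapse the writeup describes.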