15 changes: 12 additions & 3 deletions learning_curve.py
@@ -21,7 +21,7 @@ def display_digits():

def train_model():
"""Train a model on pictures of digits.

Read in 8x8 pictures of numbers and evaluate the accuracy of the model
when different percentages of the data are used as training data. This function
plots the average accuracy of the model as a function of the percent of data
@@ -32,14 +32,23 @@ def train_model():
    train_percentages = range(5, 95, 5)
    test_accuracies = numpy.zeros(len(train_percentages))


    # train models with training percentages between 5 and 90 (see
    # train_percentages) and evaluate the resultant accuracy for each.
    # You should repeat each training percentage num_trials times to smooth out
    # variability.
    # For consistency with the previous example use
    # model = LogisticRegression(C=10**-10) for your learner

    # TODO: your code here
    for i, q in enumerate(train_percentages):
        # average the test accuracy over num_trials random train/test splits
        summation = 0
        for j in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                                 train_size=q/100)
            model = LogisticRegression(C=10**-17)
            model.fit(X_train, y_train)
            summation += model.score(X_test, y_test)
        test_accuracies[i] = summation / num_trials

    fig = plt.figure()
    plt.plot(train_percentages, test_accuracies)
@@ -51,4 +60,4 @@ def train_model():
if __name__ == "__main__":
    # Feel free to comment/uncomment as needed
    display_digits()
    # train_model()
    train_model()
17 changes: 17 additions & 0 deletions questions.txt
@@ -0,0 +1,17 @@
I learned a few things from this toolbox.
There is a positive correlation between the size of the training set
and the accuracy of the model. In addition, as you increase the number
of trials, the curve begins to smooth out. I found that around 50 trials
smooths out the curve almost completely. The C value for the logistic
regression also plays a role in the accuracy of the model. As you decrease
the magnitude of the power of ten, the model gets more accurate. As you
approach a certain value, however, there is a drop-off in the accuracy of
the model. It drops almost to zero. I found this out at C = 10^-20. I think
this is because the model just can't get that low of an r value at certain
training set sizes.
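
A rough sketch of the C sweep described above (the exponent range and the
50/50 split here are assumptions for illustration, not the exact values used
in the toolbox):

# Sweep C over powers of ten and watch test accuracy collapse once the
# regularization gets extremely strong (very small C).
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older versions

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                     train_size=0.5)
for exponent in (-5, -10, -15, -20):
    model = LogisticRegression(C=10**exponent)
    model.fit(X_train, y_train)
    print("C = 10^%d  test accuracy %f" % (exponent, model.score(X_test, y_test)))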

I would have loved some more background on exactly what scikit-learn is
doing here. I know it is important to know the implementation, but it feels
very much like an unknowable black box. I know it is essentially creating
an equation based on the training set to evaluate other images, but
how it does this is a mystery to me, and I feel like it is something good to learn.
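
One way to peek inside that "black box": the fitted model is just a set of
learned weights, one per pixel for each digit class, and predictions come from
a linear score per class. A minimal sketch (not part of the toolbox code; the
50/50 split is an assumption):

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                     train_size=0.5)
model = LogisticRegression(C=10**-10)
model.fit(X_train, y_train)

# The learned "equation" is linear: class scores are X @ model.coef_.T + model.intercept_
print(model.coef_.shape)          # (10, 64): 10 digit classes x 64 pixels
print(model.intercept_.shape)     # (10,)
print(model.predict(X_test[:5]))  # predicted digits for the first five test images
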
12 changes: 12 additions & 0 deletions runner.py
@@ -0,0 +1,12 @@
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
from sklearn.linear_model import LogisticRegression

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                     train_size=0.5)

model = LogisticRegression(C=10**-10)
model.fit(X_train, y_train)
print("Train accuracy %f" % model.score(X_train, y_train))
print("Test accuracy %f" % model.score(X_test, y_test))