21 changes: 13 additions & 8 deletions learning_curve.py
@@ -1,5 +1,3 @@
""" Exploring learning curves for classification of handwritten digits """

import matplotlib.pyplot as plt
import numpy
from sklearn.datasets import *
@@ -8,19 +6,26 @@

data = load_digits()
print(data.DESCR)  # print() form works in both Python 2.7 and 3
-num_trials = 10
-train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+num_trials = 500
+train_percentages = range(5,95,1)
+test_accuracies = []
# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
# the resultant accuracy.
# You should repeat each training percentage num_trials times to smooth out variability
# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner

# TODO: your code here
for train_percentage in train_percentages: # each training percentage, 5% to 94% in 1% steps
    score_total = 0
    for i in range(num_trials): # repeat the split/fit/score cycle to smooth out variability
        # train_size expects a fraction of the data, so convert the percentage
        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentage/100.0)
        model = LogisticRegression(C=10**-10)
        model.fit(X_train, y_train)
        score_total += model.score(X_test, y_test) # accumulate this trial's test accuracy
    test_accuracies.append(score_total/num_trials) # mean accuracy for this training percentage

fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()
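For anyone who wants to run the same experiment outside the diff, here is a self-contained sketch of the loop above (a sketch, with two assumptions: a current scikit-learn, so `train_test_split` comes from `sklearn.model_selection` rather than the star import above, and `max_iter` is raised so the solver converges quietly):

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def learning_curve_accuracies(train_percentages, num_trials=10):
    """Mean test accuracy for each training percentage, averaged over num_trials random splits."""
    data = load_digits()
    means = []
    for pct in train_percentages:
        scores = []
        for _ in range(num_trials):
            # train_size takes a fraction, so convert the percentage
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=10**-10, max_iter=1000)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        means.append(np.mean(scores))
    return means
```

Plotting `learning_curve_accuracies(range(5, 95))` against the percentages reproduces the figure the script builds with matplotlib.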
9 changes: 9 additions & 0 deletions questions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
There is a general upward trend in the curve: as the training set gets larger, the results are more accurate.

The curve seems particularly noisy when the training set is around 80-90% of the data, probably because at that point
the test set is so small that a few oddball results strongly affect the measured accuracy.

I achieved a reasonably smooth curve at around 50 trials. At 500 trials, the curve was much smoother.

Other than the change in the amount of noise, I didn't notice much difference.
I am not completely sure whether this is what I should be seeing, though.
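The noise observation above can be checked with a quick simulation (a sketch, not part of the assignment: it models each test prediction as an independent coin flip with success probability `true_acc`, so the spread of the measured accuracy should shrink roughly like 1/sqrt(n_test) as the test set grows):

```python
import numpy as np

rng = np.random.default_rng(0)
true_acc = 0.9  # assumed true accuracy of the classifier

def accuracy_noise(n_test, n_trials=2000):
    """Std. dev. of measured accuracy when each of n_test predictions is correct w.p. true_acc."""
    measured = rng.binomial(n_test, true_acc, size=n_trials) / n_test
    return measured.std()

# digits has 1797 samples: a 90% training split leaves ~180 test samples,
# a 10% split leaves ~1620, so the first estimate is several times noisier
small_test_noise = accuracy_noise(180)
large_test_noise = accuracy_noise(1620)
```

This matches the report: at 80-90% training splits the test set is tiny, so individual splits scatter widely around the true accuracy, and only averaging many trials smooths the curve.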