27 changes: 26 additions & 1 deletion learning_curve.py
@@ -11,16 +11,41 @@
num_trials = 10
train_percentages = range(5, 95, 5)
test_accuracies = numpy.zeros(len(train_percentages))
print(train_percentages)

digits = load_digits()
print(digits.DESCR)
fig = plt.figure()
for i in range(10):
    subplot = fig.add_subplot(5, 2, i + 1)
    subplot.matshow(numpy.reshape(digits.data[i], (8, 8)), cmap='gray')

plt.show()

# Train a model with training percentages between 5 and 90 (see train_percentages)
# and evaluate the resulting accuracy.
# Repeat each training percentage num_trials times to smooth out variability.
# For consistency with the previous example, use model = LogisticRegression(C=10**-10).
test_accuracies = []
for train_percentage in train_percentages:
    test_accuracy_total = 0
    for i in range(num_trials):
        # reuse the digits dataset loaded above rather than reloading it each trial
        X_train, X_test, y_train, y_test = train_test_split(
            digits.data, digits.target, train_size=train_percentage / 100.0)
        # note: C=10**-1 here, not the C=10**-10 suggested in the comment above
        model = LogisticRegression(C=10**-1)
        model.fit(X_train, y_train)
        test_accuracy_total += model.score(X_test, y_test)
    test_accuracy = test_accuracy_total / num_trials
    test_accuracies.append(test_accuracy)
    print("Train Size %f" % train_percentage)
    print("Test accuracy %f" % test_accuracy)


fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.title('Accuracy on Test Set as a Function of Percentage of Data Used for Training')
plt.show()
5 changes: 5 additions & 0 deletions questions.txt
@@ -0,0 +1,5 @@
1. The accuracy on the test set was positively correlated with the percentage of data used for training: more training data generally produced higher test accuracy.
2. The noisiest part of the graph seems to be the middle, roughly from 25 to 65 percent. The likely reason is that results vary more when the training percentage is not at either extreme. More trials would probably smooth out the curve and reduce the noise.
3. Although there was always some variation, the curve was noticeably smoother when I ran 25 trials instead of 10.
4. As C decreased, accuracy improved at lower percentages of data used for training.
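The claims in questions 3 and 4 above can be probed directly. The sketch below (not part of the submission; the helper name, C values, and trial counts are illustrative) averages test accuracy over several random splits at a fixed small training fraction, for two regularization strengths:

```python
# Hypothetical sketch: average test accuracy over repeated random splits
# to compare two regularization strengths, as discussed in question 4.
# Assumes scikit-learn is installed; all specific values are illustrative.
import numpy
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

digits = load_digits()

def mean_accuracy(C, train_size=0.10, num_trials=5):
    """Average test accuracy of logistic regression over num_trials random splits."""
    scores = []
    for _ in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            digits.data, digits.target, train_size=train_size)
        model = LogisticRegression(C=C, max_iter=1000)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    return numpy.mean(scores)

for C in (10**-10, 10**-1):
    print("C=%g: mean accuracy %.3f" % (C, mean_accuracy(C)))
```

Raising `num_trials` reduces the run-to-run variance of the averaged score, which is the same mechanism that smooths the learning curve in question 3.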