24 changes: 18 additions & 6 deletions learning_curve.py
@@ -1,23 +1,35 @@
""" Exploring learning curves for classification of handwritten digits """
""" Exploring learning curves for classification of handwritten digits."""

import matplotlib.pyplot as plt
import numpy
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was renamed to sklearn.model_selection in scikit-learn 0.18
from sklearn.model_selection import train_test_split


data = load_digits()
print(data.DESCR)
num_trials = 10
train_percentages = range(5, 95, 5)
test_accuracies = numpy.zeros(len(train_percentages))

# Train a model at each training percentage between 5 and 90 (see
# train_percentages) and evaluate the resulting accuracy on the held-out data.
# Each training percentage is repeated num_trials times to smooth out
# variability. For consistency with the previous example, the learner is
# LogisticRegression(C=10**-10).

for i in range(len(train_percentages)):
    train_fraction = train_percentages[i] / 100.0
    results = []

    for j in range(num_trials):
        # Re-split the data each trial so that averaging smooths out
        # split-to-split variability.
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_fraction)
        model = LogisticRegression(C=10**-10)
        model.fit(X_train, y_train)
        results.append(model.score(X_test, y_test))

    test_accuracies[i] = numpy.average(results)


fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
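
For comparison, recent versions of scikit-learn ship a learning_curve helper in sklearn.model_selection that automates this sweep over training sizes. A minimal sketch, assuming 5-fold cross-validation in place of the manual num_trials averaging (the cv and train_sizes choices below are illustrative, not part of the original script):

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

data = load_digits()

# Sweep training-set fractions from 10% to 100% of the cross-validation
# training split; each fraction is scored on the held-out fold.
train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(C=10**-10), data.data, data.target,
    train_sizes=numpy.linspace(0.1, 1.0, 10), cv=5)

plt.plot(train_sizes, test_scores.mean(axis=1))
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy on held-out fold')
plt.show()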
8 changes: 8 additions & 0 deletions questions.txt
@@ -0,0 +1,8 @@
The general trend of the curve is upward: it rises quickly at low training percentages and more slowly at high ones.

The curve is consistently noisy between roughly 30 and 60 percent of data used. This may have to do with the balance between the amount of data available for training and the amount left over for testing.

Around 100 trials are required for a decently smooth curve; upwards of 400 trials give the curve a much smoother finish.
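
One way to check this observation is to rerun the sweep at several trial counts and overlay the curves. A rough sketch under the same setup as learning_curve.py (the helper name mean_accuracy and the specific trial counts are illustrative):

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)

def mean_accuracy(train_percentage, num_trials):
    # Average test accuracy over num_trials random train/test splits.
    scores = []
    for _ in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_percentage / 100.0)
        model = LogisticRegression(C=10**-10)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    return numpy.mean(scores)

# More trials average away more of the split-to-split variability,
# so the higher-trial curves should look visibly smoother.
for num_trials in (10, 100, 400):
    accuracies = [mean_accuracy(p, num_trials) for p in train_percentages]
    plt.plot(train_percentages, accuracies, label='%d trials' % num_trials)
plt.legend()
plt.show()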

Raising the C value to 10 appears to smooth the curve out so that it increases in a more linear progression. A graph with a C value of 2 looks very logarithmic and starts at an accuracy of around 0.82.
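
The effect of the regularization constant can be checked the same way, by holding the trial count fixed and sweeping C. A sketch with illustrative C values and trial count (not the exact settings used for the answer above):

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 100

# Larger C means weaker regularization, which shifts and reshapes the curve.
for C in (10**-10, 2, 10):
    accuracies = []
    for p in train_percentages:
        scores = []
        for _ in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=p / 100.0)
            model = LogisticRegression(C=C)
            scores.append(model.fit(X_train, y_train).score(X_test, y_test))
        accuracies.append(numpy.mean(scores))
    plt.plot(train_percentages, accuracies, label='C = %g' % C)
plt.legend()
plt.show()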