Binary file added 1000_trials.png
Binary file added 100_trials.png
Binary file added 10_trials.png
22 changes: 12 additions & 10 deletions learning_curve.py
@@ -1,26 +1,28 @@
""" Exploring learning curves for classification of handwritten digits """

import matplotlib.pyplot as plt
import numpy
from sklearn.datasets import *
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
print data.DESCR
num_trials = 10
train_percentages = range(5,95,5)
test_accuracies = numpy.zeros(len(train_percentages))
train_percentages = map(lambda x: x/100.0, range(5,95,5))

# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
# the resultant accuracy.
# You should repeat each training percentage num_trials times to smooth out variability
# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner

# TODO: your code here

def train(percent, trials):
scores = []
model = LogisticRegression(C=10**-10)
for i in range(trials):
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=(percent)) #Split the data into test and training
model.fit(X_train, y_train) #Train the model on the test set
scores.append(model.score(X_test,y_test)) #store the results
return sum(scores)/float(trials) #return the average result

results = [train(percent, num_trials) for percent in train_percentages] #list comprehension to iterate through percentages
fig = plt.figure()
plt.plot(train_percentages, test_accuracies)
plt.plot(train_percentages, results)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()
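
For anyone trying to run this file today: sklearn.cross_validation has since been removed from scikit-learn and the script targets Python 2, so here is a minimal sketch of the same experiment against the modern API (train_test_split now lives in sklearn.model_selection); the max_iter=1000 argument and the helper name average_accuracy are my own assumptions, not part of this PR.

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
num_trials = 10
train_percentages = [x / 100.0 for x in range(5, 95, 5)]  # 0.05, 0.10, ..., 0.90

def average_accuracy(train_fraction, trials):
    """Average test accuracy over `trials` random train/test splits."""
    scores = []
    for _ in range(trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=train_fraction)
        model = LogisticRegression(C=10**-10, max_iter=1000)
        model.fit(X_train, y_train)                 # fit on the training split
        scores.append(model.score(X_test, y_test))  # evaluate on the held-out split
    return np.mean(scores)

results = [average_accuracy(p, num_trials) for p in train_percentages]
plt.plot(train_percentages, results)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()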
11 changes: 11 additions & 0 deletions questions.txt
@@ -0,0 +1,11 @@
1. The general trend in the curve is that as you use a larger percentage of your data for training, you get better results on the test set.
This is because the model has more examples to learn from, so it can estimate more accurate coefficients for classifying the test data.
The curve could reasonably be approximated by a line, but it looks much more like a logarithmic curve (see the sketch below).
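
A rough way to check the "looks like a log" observation, sketched below under the assumption of the modern scikit-learn API (not part of the submitted code): rebuild a coarse learning curve and compare the squared residuals of a straight-line fit with a fit against log(fraction).

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
fractions = np.arange(5, 95, 5) / 100.0   # 0.05 .. 0.90

def mean_accuracy(frac, trials=5):
    scores = []
    for _ in range(trials):
        X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, train_size=frac)
        scores.append(LogisticRegression(C=10**-10, max_iter=1000).fit(X_tr, y_tr).score(X_te, y_te))
    return np.mean(scores)

acc = np.array([mean_accuracy(f) for f in fractions])

# Sum of squared errors for a linear fit vs. a fit against log(fraction).
lin = np.polyval(np.polyfit(fractions, acc, 1), fractions)
log = np.polyval(np.polyfit(np.log(fractions), acc, 1), np.log(fractions))
print("linear SSE:", ((lin - acc) ** 2).sum(), " log SSE:", ((log - acc) ** 2).sum())
# A smaller SSE for the log fit would support the observation above.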

2. Some parts of the curve are noisier because the randomness of the train/test split matters more when the training set is small.
With a very small training set you can randomly draw a sample that is highly representative of the handwriting in the test set, or one that is much less representative, so the measured accuracy swings more from trial to trial (see the sketch below).
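
To see this numerically, here is a sketch (assuming the modern scikit-learn API; not part of the submitted code) that measures the spread of test accuracy across repeated random splits at a small and a large training fraction.

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()

def accuracy_spread(train_fraction, trials=50):
    """Standard deviation of test accuracy over repeated random splits."""
    scores = []
    for _ in range(trials):
        X_tr, X_te, y_tr, y_te = train_test_split(data.data, data.target, train_size=train_fraction)
        scores.append(LogisticRegression(C=10**-10, max_iter=1000).fit(X_tr, y_tr).score(X_te, y_te))
    return np.std(scores)

print("std of accuracy with  5% training data:", accuracy_spread(0.05))
print("std of accuracy with 75% training data:", accuracy_spread(0.75))
# The spread at 5% is typically noticeably larger, which is what makes the
# left end of the learning curve noisy.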

3. My curve at 10 trials was distinctly not smooth, at 100 trials it was still somewhat rough, and at 1000 trials it was very smooth.
Somewhere between 100 and 1000 trials the curve becomes smooth; averaging n trials shrinks the noise by roughly a factor of sqrt(n), so by 10^3 trials a smooth curve is effectively guaranteed (see the sketch below).
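
The sqrt(n) claim can be illustrated without touching the classifier at all; the sketch below just simulates noisy per-trial accuracies (the 0.9 mean and 0.05 standard deviation are made-up values for illustration).

import numpy as np

rng = np.random.default_rng(0)
sigma = 0.05  # assumed per-trial noise in measured accuracy
for n in (10, 100, 1000):
    # Empirical standard deviation of the average of n noisy trials;
    # it shrinks like sigma / sqrt(n), i.e. about 10x smaller at 1000 trials than at 10.
    means = rng.normal(0.9, sigma, size=(5000, n)).mean(axis=1)
    print(n, "trials -> std of the averaged accuracy:", round(float(means.std()), 4))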

4. C is the inverse of the regularization strength for LogisticRegression. At higher values of C the model is allowed to fit the training data more closely (less regularization), and at lower values the model is constrained to be much simpler, which is why C=10**-10 gives such a heavily regularized, rough model (see the sketch below).
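
A quick sketch of the effect of C (assuming the modern scikit-learn API; the random_state and max_iter values are arbitrary choices of mine, not part of the submitted code):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, train_size=0.5, random_state=0)

for C in (10**-10, 1.0):
    # Smaller C = stronger regularization = a more constrained ("rougher") model.
    model = LogisticRegression(C=C, max_iter=1000).fit(X_train, y_train)
    print("C =", C, "-> test accuracy:", round(model.score(X_test, y_test), 3))
# Expect noticeably lower accuracy for C = 1e-10 than for the default C = 1.0.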