Binary file added figure_1.png
Binary file added figure_2.png
24 changes: 14 additions & 10 deletions learning_curve.py
@@ -7,17 +7,21 @@
 from sklearn.linear_model import LogisticRegression

 data = load_digits()
 print data.DESCR
-num_trials = 10
-train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+num_trials = 100
+train_percentages = range(5,90,5)
+test_accuracies = []

 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner

-# TODO: your code here
+for i in train_percentages:  # loop over the training percentages
+    list_of_n = []
+    for n in range(num_trials):  # repeat each percentage num_trials times to smooth out variability
+        # partition the data into a training set and a testing set;
+        # train_test_split treats an integer train_size as a sample count, so convert the percentage to a fraction
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=i/100.0)
+        model = LogisticRegression(C=10**-10)  # strongly regularized logistic regression
+        model.fit(X_train, y_train)
+        accuracy_of_n = model.score(X_test, y_test)  # accuracy on the held-out test set
+        list_of_n.append(accuracy_of_n)  # record each trial's score
+    average_n = sum(list_of_n) / len(list_of_n)  # average over the trials
+    test_accuracies.append(average_n)  # one averaged accuracy per training percentage

 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
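A note on the loop above: since train_test_split interprets an integer train_size as an absolute number of samples rather than a percentage, the split uses i/100.0. For reference, the same experiment can be written with a numpy score matrix, which also exposes the trial-to-trial spread discussed in questions.txt below. This is a minimal sketch, not the submitted code: it assumes a recent scikit-learn (where train_test_split lives in sklearn.model_selection), and the scores and accuracy_spread names are introduced here for illustration.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
num_trials = 100
train_percentages = range(5, 90, 5)

# one row per training percentage, one column per trial
scores = np.zeros((len(train_percentages), num_trials))
for row, pct in enumerate(train_percentages):
    for trial in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=pct / 100.0)
        model = LogisticRegression(C=10**-10)
        model.fit(X_train, y_train)
        scores[row, trial] = model.score(X_test, y_test)

test_accuracies = scores.mean(axis=1)  # average accuracy per training percentage
accuracy_spread = scores.std(axis=1)   # trial-to-trial variability per percentage

plt.plot(train_percentages, test_accuracies)
plt.show()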
7 changes: 7 additions & 0 deletions questions.txt
@@ -0,0 +1,7 @@
+1. The general trend of the curve is upward: accuracy increases as the training percentage grows.
+
+2. The middle of the curve is the noisiest. I think this is because at very low or very high training percentages the outcome is more predictable: the model either has nowhere near enough information, or it has enough to make a consistently good guess. At percentages in the middle, the model's success varies more from trial to trial, because sometimes it happens to get enough informative training data and sometimes it doesn't. This produces more noise in the center of the graph, between about 30% and 50%.
+
+3. It takes about 1000 trials to get a smooth-ish curve.
+
+4. When I increase the value of C, the curve looks much smoother; the lower the value of C, the noisier the curve (see the sketch after these answers).
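To make the observations in answers 2-4 concrete, the experiment can be rerun for several values of C, plotting each curve with error bars equal to the per-point standard deviation across trials. This is a sketch under the same assumptions as above; the particular C values swept here are illustrative, not the ones used to produce the submitted figures.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
num_trials = 10
train_percentages = range(5, 90, 5)

for C in (10**-10, 10**-5, 1.0):  # sweep from heavy to light regularization
    means, spreads = [], []
    for pct in train_percentages:
        trial_scores = []
        for _ in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            trial_scores.append(model.score(X_test, y_test))
        means.append(np.mean(trial_scores))
        spreads.append(np.std(trial_scores))
    # error bars show how noisy each point is; larger C should give a smoother curve
    plt.errorbar(train_percentages, means, yerr=spreads, label='C=%g' % C)

plt.legend()
plt.show()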