82 changes: 72 additions & 10 deletions learning_curve.py
@@ -1,26 +1,88 @@
""" Exploring learning curves for classification of handwritten digits """

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
# train_test_split moved to sklearn.model_selection in sklearn 0.18+
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
print(data.DESCR)

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=0.5)
model = LogisticRegression(C=10**-10)
model.fit(X_train, y_train)
print("Train accuracy %f" % model.score(X_train, y_train))
print("Test accuracy %f" % model.score(X_test, y_test))

# fig = plt.figure()
# for i in range(10):
#     subplot = fig.add_subplot(5, 2, i+1)
#     subplot.matshow(np.reshape(data.data[i], (8, 8)), cmap='gray')
# plt.show()

num_trials = 50  # 200 trials gives a noticeably smoother curve

train_percentages = range(5, 95, 5)
test_accuracies = []

# running totals of the splits at each training percentage, summed across trials
X_train_total = {}
X_test_total = {}
y_train_total = {}
y_test_total = {}


# loop over different C values if you want, e.g. for c in range(0, -20, -5):
for c in [0]:
    fig = plt.figure()
    Cval = 10**c
    for i in range(num_trials):
        for p in train_percentages:
            results = train_test_split(data.data, data.target, train_size=p/100.0)
            if i == 0:
                X_train_total[p] = results[0]
                X_test_total[p] = results[1]
                y_train_total[p] = results[2]
                y_test_total[p] = results[3]
            else:
                X_train_total[p] = np.add(X_train_total.get(p, 0), results[0])
                X_test_total[p] = np.add(X_test_total.get(p, 0), results[1])
                y_train_total[p] = np.add(y_train_total.get(p, 0), results[2])
                y_test_total[p] = np.add(y_test_total.get(p, 0), results[3])

            # train a model at each training percentage between 5 and 90 and record
            # its test accuracy; each percentage is repeated num_trials times to
            # smooth out variability (the assignment used LogisticRegression(C=10**-10))
            model = LogisticRegression(C=Cval)
            model.fit(results[0], results[2])
            test_accuracies.append(model.score(results[1], results[3]))

        # overlay this trial's curve on the figure, then reset for the next trial
        plt.plot(train_percentages, test_accuracies)
        test_accuracies = []

# label the per-trial figure built in the loop above (test_accuracies has just
# been reset, so re-plotting it here would only draw an empty figure)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')

fig2 = plt.figure()
test_accuracies_average = []
print(len(X_train_total))
print(X_train_total[5])
for p in train_percentages:
    # average the accumulated splits elementwise across trials; note that this
    # averages raw pixel values and class labels from different random splits,
    # so the averaged y arrays are generally not valid digit labels any more
    X_train = X_train_total[p] / num_trials
    X_test = X_test_total[p] / num_trials
    y_train = y_train_total[p] / num_trials
    y_test = y_test_total[p] / num_trials

    model = LogisticRegression(C=1.0)
    model.fit(X_train, y_train)
    test_accuracies_average.append(model.score(X_test, y_test))
plt.plot(train_percentages, test_accuracies_average)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()
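
For comparison, newer scikit-learn versions bundle this whole sweep into a helper; a minimal sketch, assuming sklearn >= 0.18 (where model_selection and learning_curve are available):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, learning_curve

data = load_digits()
# 20 random re-splits per training size play the same role as num_trials above
cv = ShuffleSplit(n_splits=20, test_size=0.1)
sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(C=10**-10), data.data, data.target,
    train_sizes=np.linspace(0.05, 0.9, 18), cv=cv)
plt.plot(sizes, test_scores.mean(axis=1))  # average accuracy across the 20 splits
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy on Test Set')
plt.show()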

15 changes: 15 additions & 0 deletions questions.txt
@@ -0,0 +1,15 @@
1. What is the general trend in the curve?

The curve looks roughly like an inverse function: the larger the percentage of data used for training, the more accurate the results are on the test set.

2. Are there parts of the curve that appear to be noisier than others? Why?

The first half of the curve (the middle part especially) is much noisier than the end of the curve, where most of the set is used for training. Presumably this is because, with less training data, the fitted model varies much more from one random split to the next.

3. How many trials do you need to get a smooth curve?

Well, I couldn't get a fully smooth curve, but by around 20 trials you can see a clear trend in the graph that plots every trial.
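
One way to get a genuinely smooth curve without hundreds of trials is to average the test accuracies across trials instead of overlaying one curve per trial; a minimal sketch under the same setup as learning_curve.py (digits data, heavily regularized model):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 20
mean_accuracies = []
for p in train_percentages:
    scores = []
    for _ in range(num_trials):
        X_tr, X_te, y_tr, y_te = train_test_split(
            data.data, data.target, train_size=p / 100.0)
        model = LogisticRegression(C=10**-10)
        model.fit(X_tr, y_tr)
        scores.append(model.score(X_te, y_te))
    mean_accuracies.append(np.mean(scores))  # one smoothed point per percentage
plt.plot(train_percentages, mean_accuracies)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()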

4. Try different values for C (by changing LogisticRegression(C=10**-10)). What happens? If you want to know why this happens, see this Wikipedia page as well as the documentation for LogisticRegression in scikit-learn.

The lower the value of C, the noisier the curve. A C value of 1 generates a pretty clean line, whereas a C value of 10^-20 shows wild variation across trials. (In scikit-learn, C is the inverse of the regularization strength, so very small C values regularize the model so heavily that it barely fits the training data.)
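
To see the effect directly, a minimal sketch that overlays a smoothed curve for each of a few C values, reusing the averaging loop from question 3:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 20
for C in [1.0, 1e-10, 1e-20]:
    means = []
    for p in train_percentages:
        scores = []
        for _ in range(num_trials):
            X_tr, X_te, y_tr, y_te = train_test_split(
                data.data, data.target, train_size=p / 100.0)
            # fit() returns the model, so we can chain straight into score()
            scores.append(LogisticRegression(C=C).fit(X_tr, y_tr).score(X_te, y_te))
        means.append(np.mean(scores))
    plt.plot(train_percentages, means, label='C = %g' % C)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()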