From fb05e5e1232934fb9f48eb3f20a92166a4f5e9ce Mon Sep 17 00:00:00 2001
From: Colvchap <colvin.chapman@students.olin.edu>
Date: Wed, 26 Apr 2017 18:16:53 -0400
Subject: [PATCH] finished toolbox

---
 learning_curve.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2baa81b..4ee4fe8 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,4 +1,5 @@
-"""Explore learning curves for classification of handwritten digits"""
+"""Explore learning curves for classification of handwritten digits
+@author Colvin """
 
 import matplotlib.pyplot as plt
 import numpy
@@ -21,15 +22,16 @@ def display_digits():
 
 def train_model():
     """Train a model on pictures of digits.
-    
+
     Read in 8x8 pictures of numbers and evaluate the accuracy of the model
     when different percentages of the data are used as training data. This function
     plots the average accuracy of the model as a function of the percent of data
     used to train it.
     """
     data = load_digits()
-    num_trials = 10
+    num_trials = 75
     train_percentages = range(5, 95, 5)
+    all_tests = numpy.zeros(len(train_percentages))
     test_accuracies = numpy.zeros(len(train_percentages))
 
     # train models with training percentages between 5 and 90 (see
@@ -38,11 +40,29 @@ def train_model():
     # variability.
     # For consistency with the previous example use
     # model = LogisticRegression(C=10**-10) for your learner
+    for test in range(num_trials):
+        # Each trial re-splits the data and refits at every training percentage.
+        for i, percentage in enumerate(train_percentages):
+            train_size = .01*percentage    # fraction of the data, not a percent

+            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
+                                                                train_size=train_size)
+            model = LogisticRegression(C=100)

+            model.fit(X_train, y_train)
+            # Only the held-out score feeds the learning curve; the training
+            # score is not recorded.

+            # Slot i lines up with train_percentages[i].
+            test_accuracies[i] = model.score(X_test, y_test)

-    # TODO: your code here
+        all_tests = numpy.vstack((all_tests, test_accuracies))    # append this trial's row

+    final_test_accuracies = numpy.mean(all_tests[1:], axis=0)  # skip row 0: the all-zero seed, not a trial
+    print(final_test_accuracies)
     fig = plt.figure()
-    plt.plot(train_percentages, test_accuracies)
+    print(train_percentages, final_test_accuracies)
+    plt.plot(train_percentages, final_test_accuracies)
     plt.xlabel('Percentage of Data Used for Training')
     plt.ylabel('Accuracy on Test Set')
     plt.show()
@@ -50,5 +70,5 @@ def train_model():
 
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    #display_digits()
+    train_model()