From c888736f7349504f9b465e24974ea710da5d5154 Mon Sep 17 00:00:00 2001
From: JosephLee19
Date: Sat, 26 Mar 2016 23:57:40 -0400
Subject: [PATCH 1/2] Questions for Machine Learning Toolbox

---
 questions.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..c95cbb8
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,9 @@
+There is a general upward trend in the curve: as the training set gets larger, the results are more accurate.
+
+The curve is particularly noisy when the training set is around 80-90%, probably because at that point the test set is so small that
+a few oddball results have a large effect on the measured accuracy.
+
+I got a readably smooth curve at around 50 trials. At 500 trials, the curve was much smoother.
+
+I didn't notice much difference other than a change in the amount of noise.
+I am not completely sure whether this is what I should be seeing, though.
\ No newline at end of file

From 4a73ea91bdb7dc2f64ee0654e81a5ae9671c6104 Mon Sep 17 00:00:00 2001
From: JosephLee19
Date: Sat, 26 Mar 2016 23:58:06 -0400
Subject: [PATCH 2/2] Completed code for Machine Learning Toolbox

---
 learning_curve.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..3ed1076 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,5 +1,3 @@
-""" Exploring learning curves for classification of handwritten digits """
-
 import matplotlib.pyplot as plt
 import numpy
 from sklearn.datasets import *
@@ -8,19 +6,26 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
-train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
-
+num_trials = 500
+train_percentages = range(5, 95, 1)
+test_accuracies = []
+
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+for train_percentage in train_percentages:  # training percentages from 5% to 94% in 1% steps
+    accuracy_sum = 0.0
+    for i in range(num_trials):  # repeat each split num_trials times to smooth out variability
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentage / 100.0)  # train_size expects a fraction, not a percent
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        accuracy_sum += model.score(X_test, y_test)  # accumulate this trial's test accuracy
+    test_accuracies.append(accuracy_sum / num_trials)  # average accuracy over num_trials trials
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
-plt.show()
+plt.show()
\ No newline at end of file
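
The noise that questions.txt describes at 80-90% training splits has a simple source: the digits dataset has 1797 samples, so a 90% split leaves only about 180 test samples, and the standard error of an accuracy measured on n_test samples is sqrt(p*(1-p)/n_test). A minimal back-of-the-envelope sketch of that effect, assuming an illustrative true accuracy of p = 0.9 (a placeholder, not a measured value):

import math

n_samples = 1797  # size of sklearn's load_digits dataset
p = 0.9           # assumed true accuracy, for illustration only

for train_fraction in (0.5, 0.9):
    # samples left over for testing at this training fraction
    n_test = int(round(n_samples * (1 - train_fraction)))
    # binomial standard error of the measured accuracy
    std_err = math.sqrt(p * (1 - p) / n_test)
    print('train %.0f%%: n_test = %d, accuracy std err ~ %.3f' % (100 * train_fraction, n_test, std_err))

The spread roughly doubles between a 50% and a 90% split, and averaging over num_trials independent splits shrinks it by a further factor of about sqrt(num_trials), which is consistent with the curve smoothing out between 50 and 500 trials.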