From 6896c5a19accd741e664a1aaf47ef4297c398e59 Mon Sep 17 00:00:00 2001
From: Prava Dhulipalla
Date: Tue, 28 Mar 2017 23:26:51 -0400
Subject: [PATCH 1/2] Answering the necessary questions for Machine Learning
 Project Toolbox

---
 questions.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..4eeb3a3
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,17 @@
+1. The curve shows a positive correlation: as the percentage of the data used
+for training increases, the accuracy on the test set also increases. With more
+training data, the model has seen a wider variety of examples and can better
+generalize to the unseen values in the test set.
+
+2. The noise is more apparent when a lower percentage of the data is used for
+training. With little training data, each random train/test split hands the
+model a different small subset to learn from, so its test accuracy swings from
+run to run: sometimes the subset happens to be representative (higher accuracy)
+and sometimes it does not (lower accuracy).
+
+3. The curve becomes smooth at around 100 trials or so.
+
+4. Varying C (scikit-learn's inverse regularization strength) changes how much
+training data is needed to reach a given accuracy on the test set. A higher C
+means less training data is needed for the same accuracy, while a smaller C
+means more training data is needed to reach the same accuracy threshold.

From 326fdad09f4b794dd337f6bbbd44132d8ffdc720 Mon Sep 17 00:00:00 2001
From: Prava Dhulipalla
Date: Tue, 28 Mar 2017 23:27:47 -0400
Subject: [PATCH 2/2] This is the main file for the Machine Learning Project
 Toolbox

---
 learning_curve.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index fdce500..97eccef 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -20,7 +20,7 @@ def display_digits():
 
 def train_model():
     data = load_digits()
-    num_trials = 10
+    num_trials = 100
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -31,16 +31,29 @@
     # For consistency with the previous example use
     # model = LogisticRegression(C=10**-10) for your learner
 
-    # TODO: your code here
+    # enumerate pairs each training percentage with its index in the results array
+    for i, size in enumerate(train_percentages):
+        accuracy = []
+        for j in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=size / 100.0)
+
+            # fit the logistic regression model and score it on the held-out test set
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            accuracy.append(model.score(X_test, y_test))
+
+        # average the accuracy over all trials for this training percentage
+        test_accuracies[i] = numpy.mean(accuracy)
+
+    # plot percentage of data used for training vs. accuracy on the test set
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
     plt.xlabel('Percentage of Data Used for Training')
     plt.ylabel('Accuracy on Test Set')
     plt.show()
 
-
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
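
The run-to-run noise described in answers 2 and 3 can be measured directly. Below
is a minimal standalone sketch, not part of either patch, that reruns the random
train/test split many times at a small and a large training fraction and reports
the spread of the test accuracy. The names digits, train_fraction, and scores are
illustrative; it assumes only the scikit-learn pieces the patch already uses
(load_digits, LogisticRegression, train_test_split).

    # Standalone sketch (assumes the same scikit-learn API as the patch).
    # Quantifies the run-to-run noise from answers 2 and 3 by measuring the
    # spread of test accuracy across repeated random train/test splits.
    import numpy
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    for train_fraction in (0.05, 0.90):
        scores = []
        for _ in range(100):
            X_train, X_test, y_train, y_test = train_test_split(
                digits.data, digits.target, train_size=train_fraction)
            model = LogisticRegression(C=10**-10)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        print('train_size=%.2f: mean=%.3f, std=%.3f'
              % (train_fraction, numpy.mean(scores), numpy.std(scores)))

The standard deviation at train_size=0.05 should come out noticeably larger than
at 0.90, which is the noise visible at the left edge of the plotted curve.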
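
Answer 4's claim about C can likewise be checked by overlaying learning curves
for several C values. This is a sketch under the same assumptions as above, not
part of the patch; in scikit-learn, C is the inverse regularization strength, so
a larger C regularizes less and lets the model fit the available training data
more closely.

    # Standalone sketch (assumes the same scikit-learn API as the patch).
    # Overlays learning curves for several values of C to illustrate how C
    # shifts the amount of training data needed to reach a given accuracy.
    import numpy
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    train_percentages = range(5, 95, 5)
    for C in (10**-10, 10**-5, 1.0):
        curve = []
        for size in train_percentages:
            trial_scores = []
            for _ in range(10):  # fewer trials than the patch, for speed
                X_train, X_test, y_train, y_test = train_test_split(
                    digits.data, digits.target, train_size=size / 100.0)
                model = LogisticRegression(C=C)
                model.fit(X_train, y_train)
                trial_scores.append(model.score(X_test, y_test))
            curve.append(numpy.mean(trial_scores))
        plt.plot(list(train_percentages), curve, label='C=%g' % C)
    plt.xlabel('Percentage of Data Used for Training')
    plt.ylabel('Accuracy on Test Set')
    plt.legend()
    plt.show()

With only 10 trials per point these curves are noisier than the patch's, but the
ordering by C should still be visible: the higher-C curves sit above the
heavily regularized C=10**-10 curve at the same training percentage.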