From d73d6c4cfad6d553c42cf76d1006ea34b85a41b8 Mon Sep 17 00:00:00 2001
From: Jeremy Garcia <jeremiah.garcia@students.olin.edu>
Date: Sat, 19 Mar 2016 16:46:42 -0400
Subject: [PATCH] turning in project toolbox - machine learning

---
 learning_curve.py | 10 +++++++---
 questions.txt     |  7 +++++++
 questions.txt~    |  7 +++++++
 3 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 questions.txt
 create mode 100644 questions.txt~

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..5a02e2c 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -8,7 +8,7 @@
 
 data = load_digits()
 print data.DESCR
-num_trials = 10
+num_trials = 100
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -16,8 +16,12 @@
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-
-# TODO: your code here
+for i, percent in enumerate(train_percentages):  # i is the slot in test_accuracies for this percentage
+	for _trial in range(num_trials):  # repeat with fresh random splits to smooth out variability
+		X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=percent / 100.0)  # float => fraction of the data; an int would mean an absolute sample count
+		model = LogisticRegression(C=10**1)
+		model.fit(X_train, y_train)
+		test_accuracies[i] += model.score(X_test, y_test) / num_trials  # accumulate the mean accuracy, not the raw sum
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..6b498ee
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,7 @@
+1. Generally, as the percentage of data used increases, so does the accuracy on the test set.
+
+2. It seemed to me that the whole curve appeared to be noisy, especially at a low number of trials. There didn't seem to be an area that was noisier than others.
+
+3. After around 700 or 800 trials, the curve starts to become smooth. 
+
+4. As C gets larger, the graph becomes smoother, and as C gets smaller, the graph becomes very noisy. 
diff --git a/questions.txt~ b/questions.txt~
new file mode 100644
index 0000000..6b498ee
--- /dev/null
+++ b/questions.txt~
@@ -0,0 +1,7 @@
+1. Generally, as the percentage of data used increases, so does the accuracy on the test set.
+
+2. It seemed to me that the whole curve appeared to be noisy, especially at a low number of trials. There didn't seem to be an area that was noisier than others.
+
+3. After around 700 or 800 trials, the curve starts to become smooth. 
+
+4. As C gets larger, the graph becomes smoother, and as C gets smaller, the graph becomes very noisy.