From d0560e1eea769aca8eee4e30aeca6536b82809ba Mon Sep 17 00:00:00 2001 From: hmowen Date: Sun, 27 Mar 2016 22:51:57 -0400 Subject: [PATCH] Submission for Machine Learning Toolbox --- learning_curve.py | 12 ++++++++++-- questions.txt | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 questions.txt diff --git a/learning_curve.py b/learning_curve.py index 2364f2c..58e00bf 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -8,7 +8,7 @@ data = load_digits() print data.DESCR -num_trials = 10 +num_trials = 2500 train_percentages = range(5,95,5) test_accuracies = numpy.zeros(len(train_percentages)) @@ -17,7 +17,15 @@ # You should repeat each training percentage num_trials times to smooth out variability # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner -# TODO: your code here +for i in range(len(train_percentages)): + trial_accuracies = numpy.zeros(num_trials) + for trial in range(num_trials): + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentages[i]) + model = LogisticRegression(C=10**-10) + model.fit(X_train, y_train) + trial_accuracies[trial] = model.score(X_test, y_test) + test_accuracies[i] = sum(trial_accuracies) / num_trials + fig = plt.figure() plt.plot(train_percentages, test_accuracies) diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..819731e --- /dev/null +++ b/questions.txt @@ -0,0 +1,5 @@ + 1. The general trend of the curve is a positive correlation: test accuracy increases as the percentage of data used for training increases. 2. There is more noise towards the two ends of the curve, probably because the train/test split is heavily imbalanced at either extreme (very little training data at one end, very little test data at the other). 3. The graph becomes fairly smooth at around 3000 trials. 4. As C increases, the graph becomes more of a curve than a line, appearing to approach an asymptote. \ No newline at end of file