From 7ea04fab9f9158cb607b348194b9155e8367fc46 Mon Sep 17 00:00:00 2001
From: David Papp
Date: Sun, 20 Mar 2016 16:14:38 -0400
Subject: [PATCH 1/2] Updated learning_curve.py

---
 learning_curve.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..e4bfee5 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -6,11 +6,24 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
 
+
 data = load_digits()
 print data.DESCR
-num_trials = 10
+
+num_trials = 100
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = []
+for i in train_percentages:
+    avg_test_accuracy = 0
+    for j in range(0, num_trials):
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=i / 100.0)
+        model = LogisticRegression(C=10**-3)
+        model.fit(X_train, y_train)
+        avg_test_accuracy += model.score(X_test, y_test)
+    avg_test_accuracy /= num_trials
+    print i
+    print "Test accuracy %f"%avg_test_accuracy
+    test_accuracies.append(avg_test_accuracy)
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
@@ -18,7 +31,6 @@
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 # TODO: your code here
 
-
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')

From a064f9976de47477cd3b745027ba175728e70cb2 Mon Sep 17 00:00:00 2001
From: David Papp
Date: Sun, 20 Mar 2016 16:14:50 -0400
Subject: [PATCH 2/2] Answered questions

---
 questions.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..2eb6b71
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1. The general trend in the curve is upwards, with some amount of diminishing returns at higher percentages of data used for training.
+2. Lower percentages of data used for training tend to result in more noise. This makes sense because with less training data, the model is likely to show larger fluctuations in accuracy.
+3. 1000 trials produces a decently smooth curve.
+4. As C increases, the accuracy values all increase, since C is the inverse of regularization strength and larger values of C mean weaker regularization. The graph also acquires a more curved shape, suggesting that the rate of diminishing returns increases as C increases. In other words, the first few percentage points of additional training data matter less for low values of C than they do for high values of C.
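
Not part of either patch: a minimal sketch of how the claim in question 4 could be explored, assuming the same Python 2 / older scikit-learn environment the patch targets (load_digits, sklearn.cross_validation, LogisticRegression). The particular C values, the reduced num_trials, and the overlaid plot are illustrative choices, not taken from the commits.

# Sketch: sweep a few values of C and overlay one averaged learning curve per value,
# to see how regularization strength changes the shape of the curve.
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 10  # fewer trials than the patch uses, just to keep the sweep quick

for C in [10**-10, 10**-3, 1.0]:  # illustrative values only
    curve = []
    for pct in train_percentages:
        avg = 0.0
        for _ in range(num_trials):
            # random split at this training percentage, same as in the patch
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            avg += model.score(X_test, y_test)
        curve.append(avg / num_trials)
    plt.plot(train_percentages, curve, label='C=%g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend(loc='lower right')
plt.show()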