diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..ce5c7b1 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -8,16 +8,41 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
+num_trials = 50
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 
-# TODO: your code here
+test_accuracies = []
+
+model = LogisticRegression(C=10**-10)
+# average train/test accuracy over num_trials random splits at each training percentage
+for percentage in train_percentages:
+    train_total = 0
+    test_total = 0
+    for trial in range(num_trials):
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=percentage/100.0)
+        model.fit(X_train, y_train)
+        train_total += model.score(X_train, y_train)
+        test_total += model.score(X_test, y_test)
+    train_average = train_total/num_trials
+    test_average = test_total/num_trials
+    print "Training percentage %d" % percentage
+    print "Train accuracy %f" % train_average
+    print "Test accuracy %f" % test_average
+    test_accuracies.append(test_average)
+
+
+# display the first ten digit images (reusing the dataset loaded above)
+fig = plt.figure()
+for i in range(10):
+    subplot = fig.add_subplot(5, 2, i+1)
+    subplot.matshow(numpy.reshape(data.data[i], (8, 8)), cmap='gray')
+
+plt.show()
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..882b449
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1.) Upward; generally, the more of the data that is used for training, the more accurate the model is on the test set.
+2.) Yes, at the smaller training percentages. Less training data leads to more variable results from trial to trial.
+3.) At num_trials = 50, the curve looks consistently smooth.
+4.) At larger values of C, the calculations take significantly longer, but test accuracy climbs more steeply with the percentage of training data and seems to asymptotically approach 90% accuracy. At smaller values, the calculations are fast, but the accuracy is terrible.
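
Note (not part of the patch): question 4 compares regularization strengths. The sketch below is one way to generate that comparison as a set of learning curves, one per value of C. It is a minimal illustration, not the submitted code; it is written in Python 2 to match the patch, and the import paths are assumptions (mirror whatever learning_curve.py already imports -- newer scikit-learn releases move train_test_split to sklearn.model_selection).

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split  # assumed import path, as in older scikit-learn

data = load_digits()
train_percentages = range(5, 95, 5)

# one learning curve per regularization strength C
for C in [10**-10, 10**-5, 1]:
    model = LogisticRegression(C=C)
    accuracies = []
    for percentage in train_percentages:
        total = 0
        # average test accuracy over several random splits to smooth out variability
        for trial in range(10):
            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=percentage/100.0)
            model.fit(X_train, y_train)
            total += model.score(X_test, y_test)
        accuracies.append(total/10)
    plt.plot(train_percentages, accuracies, label='C=%g' % C)

plt.xlabel('Percent of data used for training')
plt.ylabel('Test accuracy')
plt.legend(loc='lower right')
plt.show()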