diff --git a/learning_curve.py b/learning_curve.py index 2364f2c..3ed1076 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -1,5 +1,3 @@ -""" Exploring learning curves for classification of handwritten digits """ - import matplotlib.pyplot as plt import numpy from sklearn.datasets import * @@ -8,19 +6,26 @@ data = load_digits() print data.DESCR -num_trials = 10 -train_percentages = range(5,95,5) -test_accuracies = numpy.zeros(len(train_percentages)) - +num_trials = 500 +train_percentages = range(5,95,1) +test_accuracies = [] + # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate # the resultant accuracy. # You should repeat each training percentage num_trials times to smooth out variability # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner -# TODO: your code here +for train_percentage in train_percentages: # For each percentage in train_percentages (goes from 5% to 95% in increments of 1%) + running_average_variable = 0 + for i in range(0,num_trials): # Run the number of trials specified + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size = train_percentage) + model = LogisticRegression(C=10**-10) + model.fit(X_train, y_train) + running_average_variable += model.score(X_test, y_test) # Add up all of the running averages + test_accuracies.append(running_average_variable/num_trials) # divide running average by the number of trials to attain actual average fig = plt.figure() plt.plot(train_percentages, test_accuracies) plt.xlabel('Percentage of Data Used for Training') plt.ylabel('Accuracy on Test Set') -plt.show() +plt.show() \ No newline at end of file diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..c95cbb8 --- /dev/null +++ b/questions.txt @@ -0,0 +1,9 @@ + There is a general upwards trend in the curve...i.e. 
as the training set gets larger, the results are more accurate + + It seems particularly noisy when the training set is around 80-90%...probably because at that point the test set is so small that + a few oddball results really impact the rest of the curve + + I achieved a readably smooth curve around 50 trials. At 500 trials, the curve was much smoother. + + I didn't notice much difference other than it seemed to change the amount of noise. + I am not completely sure if this is what I should be seeing or not though \ No newline at end of file