diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..35edf7e 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -8,7 +8,7 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
+num_trials = 100
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -16,11 +16,32 @@
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+# Collect the mean test-set accuracy for each training percentage.
+result_data_test = []
+for percentage in train_percentages:
+    test_data = []
+    for _ in range(num_trials):
+        # train_size takes a fraction of the data, so convert the percentage to [0, 1]
+        X_train, X_test, y_train, y_test = train_test_split(
+            data.data, data.target, train_size=percentage / 100.0)
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        test_data.append(model.score(X_test, y_test))  # list of per-trial accuracies
+    # (training accuracy could be tracked the same way with model.score(X_train, y_train))
+    result_data_test.append(numpy.mean(test_data))
+test_accuracies = result_data_test
+
-# TODO: your code here
 fig = plt.figure()
-plt.plot(train_percentages, test_accuracies)
+plt.plot(train_percentages, test_accuracies, label='testing data')
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
+plt.legend()
 plt.show()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..795f118
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,10 @@
+1. The general trend in the curve is upward: the more data you use for training, the better the accuracy on the
+test set!
+2. The low end of the curve (small percentage of data used for training) is very noisy because the training split is
+drawn at random; with only a few examples, the split may not reflect the true characteristics of each digit (in other
+words, outliers can dominate the training set when the percentage of data used for training is low).
+3. To get a smooth (not super spiky) curve, I tried 100 trials, which helped a lot but wasn't enough to remove all of
+the spikiness; at 1000 trials the curve is reasonably smooth.
+4. As I try different values of C, the curve changes. Using 100 trials each, the curve at C=1**-10 (i.e. C=1) is
+extremely smooth, at C=10**-10 it is fairly lumpy/spiky (linear in smaller portions of the graph), and at C=100**-10
+much more so (linear in large portions of the graph).
\ No newline at end of file
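
Question 4 in questions.txt compares learning curves for different values of C. The sketch below shows one way such a comparison could be run on a single figure; it is a minimal illustration, not the author's script. It assumes train_test_split lives in sklearn.model_selection (the diff does not show where learning_curve.py imports it from), uses the three C values named in question 4 (1**-10, 10**-10, 100**-10, i.e. 1, 1e-10, 1e-20), and mirrors the num_trials and percentage grid from the diff above.

# Sketch (assumptions noted in the comments): overlay learning curves for the
# three C settings mentioned in question 4.
import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # assumed import location

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 100  # matches the diff; question 3 suggests 1000 for a smoother curve

for C in [1.0, 1e-10, 1e-20]:  # 1**-10, 10**-10, 100**-10 from question 4
    curve = []
    for percentage in train_percentages:
        scores = []
        for _ in range(num_trials):
            # Resplit the digits data, training on the given fraction
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=percentage / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        curve.append(numpy.mean(scores))
    plt.plot(list(train_percentages), curve, label='C = %g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()

Putting all three curves on one axis makes the smoothness comparison described in question 4 directly visible, at the cost of roughly tripling the runtime of the single-curve script in the diff.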