From c888736f7349504f9b465e24974ea710da5d5154 Mon Sep 17 00:00:00 2001
From: JosephLee19
Date: Sat, 26 Mar 2016 23:57:40 -0400
Subject: [PATCH 1/2] Questions for Machine Learning Toolbox

---
 questions.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..c95cbb8
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,9 @@
+There is a general upward trend in the curve: as the training set gets larger, the results are more accurate.
+
+The curve is particularly noisy when the training set is around 80-90%, probably because at that point the test set is so small that
+a few oddball results have a large effect on the measured accuracy.
+
+I got a readably smooth curve at around 50 trials. At 500 trials, the curve was much smoother.
+
+I didn't notice much difference other than a change in the amount of noise.
+I am not completely sure whether this is what I should be seeing, though.
\ No newline at end of file

From 4a73ea91bdb7dc2f64ee0654e81a5ae9671c6104 Mon Sep 17 00:00:00 2001
From: JosephLee19
Date: Sat, 26 Mar 2016 23:58:06 -0400
Subject: [PATCH 2/2] Completed code for Machine Learning Toolbox

---
 learning_curve.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..3ed1076 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,5 +1,3 @@
-""" Exploring learning curves for classification of handwritten digits """
-
 import matplotlib.pyplot as plt
 import numpy
 from sklearn.datasets import *
@@ -8,19 +6,26 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
-train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
-
+num_trials = 500
+train_percentages = range(5, 95, 1)
+test_accuracies = []
+
 
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+for train_percentage in train_percentages:  # training percentages from 5% to 94% in 1% steps
+    accuracy_sum = 0.0
+    for i in range(num_trials):  # repeat each split num_trials times to smooth out variability
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentage / 100.0)  # train_size expects a fraction, not a percent
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        accuracy_sum += model.score(X_test, y_test)  # accumulate this trial's test accuracy
+    test_accuracies.append(accuracy_sum / num_trials)  # average accuracy over num_trials trials
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
-plt.show()
+plt.show()
\ No newline at end of file
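
The noise that questions.txt describes at 80-90% training splits has a simple source: the digits dataset has 1797 samples, so a 90% split leaves only about 180 test samples, and the standard error of an accuracy measured on n_test samples is sqrt(p*(1-p)/n_test). A minimal back-of-the-envelope sketch of that effect, assuming an illustrative true accuracy of p = 0.9 (a placeholder, not a measured value):

import math

n_samples = 1797  # size of sklearn's load_digits dataset
p = 0.9           # assumed true accuracy, for illustration only

for train_fraction in (0.5, 0.9):
    # samples left over for testing at this training fraction
    n_test = int(round(n_samples * (1 - train_fraction)))
    # binomial standard error of the measured accuracy
    std_err = math.sqrt(p * (1 - p) / n_test)
    print('train %.0f%%: n_test = %d, accuracy std err ~ %.3f' % (100 * train_fraction, n_test, std_err))

The spread roughly doubles between a 50% and a 90% split, and averaging over num_trials independent splits shrinks it by a further factor of about sqrt(num_trials), which is consistent with the curve smoothing out between 50 and 500 trials.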