diff --git a/figure_1.png b/figure_1.png
new file mode 100644
index 0000000..003410a
Binary files /dev/null and b/figure_1.png differ
diff --git a/figure_2.png b/figure_2.png
new file mode 100644
index 0000000..a2df279
Binary files /dev/null and b/figure_2.png differ
diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..65be11b 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,17 +7,21 @@
 from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
-num_trials = 10
-train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+num_trials = 100
+train_percentages = range(5, 90, 5)
+test_accuracies = []
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-
-# TODO: your code here
+for i in train_percentages:  # loop over training-set percentages
+    list_of_n = []
+    for n in range(num_trials):  # repeat each percentage num_trials times to smooth out variability
+        # Partition the data into a training set and a testing set.
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=i / 100.0)
+        model = LogisticRegression(C=10**-10)  # heavily regularized learner, for consistency with the previous example
+        model.fit(X_train, y_train)
+        accuracy_of_n = model.score(X_test, y_test)  # accuracy on the held-out test set
+        list_of_n.append(accuracy_of_n)  # record this trial's score
+    average_n = sum(list_of_n) / len(list_of_n)  # average over all trials
+    test_accuracies.append(average_n)  # one averaged accuracy per training percentage
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..e688c74
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,7 @@
+1. The general trend of the curve is upward: test accuracy increases as the training percentage increases.
+
+2. The middle of the curve is the noisiest. I think this is because at very low or very high training percentages the classifier's accuracy is consistently low or high: it either has nowhere near enough information or it has plenty to make a good guess. At intermediate training percentages its accuracy is more variable, because from one random split to the next it sometimes has enough information and sometimes does not. This produces more noise in the center of the graph, roughly between 30% and 50%.
+
+3. It takes about 1000 trials to get a reasonably smooth curve.
+
+4. When I increase the value of C, the curve looks much smoother; the lower the value of C, the noisier the curve.
\ No newline at end of file
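
As a follow-up to question 4, here is a minimal sketch (not part of the submitted diff) of how one might sweep the regularization parameter C and overlay the resulting learning curves to compare their noisiness. It reuses the load_digits / train_test_split / LogisticRegression setup from learning_curve.py; the c_values list, the smaller num_trials, and the modern sklearn.model_selection import path are assumptions for illustration, not part of the original code.

# Sketch only: compare learning curves for a few assumed values of C.
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older releases
from sklearn.linear_model import LogisticRegression

data = load_digits()
num_trials = 20                      # more trials -> smoother curves (see question 3)
train_percentages = range(5, 90, 5)
c_values = [10**-10, 10**-5, 1.0]    # illustrative values for question 4

for c in c_values:
    test_accuracies = []
    for pct in train_percentages:
        scores = []
        for _ in range(num_trials):
            # Random split: pct% of the digits for training, the rest for testing.
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=c)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        test_accuracies.append(sum(scores) / len(scores))  # average over trials
    plt.plot(list(train_percentages), test_accuracies, label='C = %g' % c)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()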