diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..b251783 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,7 +7,6 @@
 from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
 num_trials = 10
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
@@ -15,9 +14,25 @@
 # train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+
+for j in range(len(train_percentages)):
+
+    total_accuracy = 0
+
+    for i in range(num_trials):
+
+        t = train_percentages[j] / 100.0  # convert the percentage to a fraction for train_test_split
+
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=t)
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        total_accuracy += model.score(X_test, y_test)
+
+    avg_accuracy = total_accuracy / num_trials
+
+    test_accuracies[j] = avg_accuracy
 
-# TODO: your code here
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..4f3f8cd
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1. The curve generally increases with the training percentage and is fairly linear.
+2. The beginning of the curve is noticeably noisier than the end. This is probably because a model trained on only 5% of the data is close to guessing on the test set, so its accuracy varies a lot from trial to trial.
+3. I got a fairly smooth curve with about 100 trials; 70 trials was still a bit bumpy, and even at 200 trials there were some bumps at the beginning of the curve.
+4. When I changed C to 1/10, the curve looked more exponential than linear, the test accuracies were much higher at every training percentage, and the curve was fairly smooth. When I changed C to 10^(-15), the accuracies were about the same as with 10^(-10), but the curve was much bumpier.
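A side note on questions.txt, answer 4: the C comparison described there can be scripted instead of re-editing the constant by hand. Below is a minimal sketch, not part of the commit. It assumes Python 3 and the modern sklearn.model_selection import path (older scikit-learn versions, which this repo likely targets, exposed train_test_split from sklearn.cross_validation); the C values and trial count are taken from the answers above, and the axis labels are illustrative.

# Sketch: averaged learning curves for several regularization strengths C.
# Assumes sklearn.model_selection (newer scikit-learn layout).
import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
num_trials = 100                     # ~100 trials gave a fairly smooth curve (answer 3)
train_percentages = range(5, 95, 5)

for C in (10**-15, 10**-10, 0.1):    # the three values compared in answer 4
    accuracies = numpy.zeros(len(train_percentages))
    for j, pct in enumerate(train_percentages):
        total = 0.0
        for _ in range(num_trials):
            # resample a fresh train/test split each trial to smooth out variability
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            total += model.score(X_test, y_test)
        accuracies[j] = total / num_trials
    plt.plot(list(train_percentages), accuracies, label='C = %g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()

Raising num_trials mainly smooths the noisy low-percentage end of each curve (answers 2 and 3), at the cost of a proportional increase in runtime.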