diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..3f7d028 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -8,17 +8,30 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
-train_percentages = range(5,95,5)
+num_trials = 100
+train_percentages = range(5, 95, 5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+# train a model with training percentages between 5 and 90 (see
+# train_percentages) and evaluate the resultant accuracy.
+# You should repeat each training percentage num_trials times to smooth out
+# variability for consistency with the previous example use
+# model = LogisticRegression(C=10**-10) for your learner
 
-# TODO: your code here
+# create a model (C=10**-10 per the instructions above)
+model = LogisticRegression(C=10**-10)
+# repeat each training percentage num_trials times to smooth out variability
+for n in range(len(train_percentages)):
+    for i in range(num_trials):
+        # split the data; train_size expects a fraction, not a percentage
+        X_train, X_test, y_train, y_test = train_test_split(data.data,
+                                                            data.target,
+                                                            train_size=train_percentages[n] / 100.0)
+        model.fit(X_train, y_train)
+        test_accuracies[n] += model.score(X_test, y_test) / num_trials  # running mean over trials
+
+# create plot and put the correct things on it
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..575f0a9
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,7 @@
+1. The general trend in the curve is positive.
+
+2. At only 10 trials, almost the entire curve seems to be noisy. Typically, accuracy from training on 5% of the data is lower than from training on 90%, but everything in between is pretty inconsistent.
+
+3. 100 trials makes a fairly smooth curve, but it's still pretty jagged in places.
+
+4. The scale of accuracy on the test set gets smaller with a smaller C value and the entire curve becomes less smooth.