diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..ea968ee 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,20 +7,21 @@
 from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
-num_trials = 10
+num_trials = 100
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = []
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-
-# TODO: your code here
+for n in train_percentages:  # for each training percentage
+    average_test = 0.0
+    for i in range(num_trials):  # repeat each percentage num_trials times to smooth out variability
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=n/100.0)  # n is a percentage; train_size expects a fraction
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        average_test += model.score(X_test, y_test)  # accumulate test accuracy across trials
+    test_accuracies.append(average_test/num_trials)  # record the average accuracy for this percentage
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
-plt.show()
+plt.show()
\ No newline at end of file
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..c674e3e
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1. The general trend of the curve is upward: test accuracy increases as the training percentage grows.
+2. Yes, the beginning of the graph tends to be noisier than the rest. I believe this occurs because the model is trained on only a small portion of the data, so its test accuracy varies a lot from one random split to the next.
+3. Around 1000 trials gave me a smooth curve.
+4. Higher values of C gave me a smoother curve that typically increases with negative concavity (accuracy rises quickly and then levels off). Lower values of C made the graph much noisier.
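
Sketch (not part of the patch): questions.txt item 4 compares the learning curves produced by different values of C. One way to reproduce that comparison is the short script below. It assumes Python 3 and a current scikit-learn, where train_test_split lives in sklearn.model_selection rather than the older sklearn.cross_validation module the patched file uses; the particular C values and the trial count are illustrative choices, not taken from the patch.

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
num_trials = 20                      # fewer than the patch's 100, just to keep the sweep quick
train_percentages = range(5, 95, 5)

for C in (10**-10, 10**-5, 1.0):     # illustrative regularization strengths
    test_accuracies = []
    for n in train_percentages:
        total = 0.0
        for _ in range(num_trials):  # average over repeated random splits
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=n / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            total += model.score(X_test, y_test)
        test_accuracies.append(total / num_trials)
    plt.plot(train_percentages, test_accuracies, label='C = %g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()

Overlaying the curves this way makes the effect described in item 4 visible directly: the heavily regularized model (small C) stays noisy, while larger C values give the smoother, concave-down curve the answer describes.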