diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..a8e9f31 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,8 +7,8 @@
 from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
-print data.DESCR
-num_trials = 10
+# print data.DESCR
+num_trials = 100
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -17,10 +17,21 @@
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
 
-# TODO: your code here
+for i, percentage in enumerate(train_percentages):
+    trial_accuracies = numpy.zeros(num_trials)
+    for trial in range(num_trials):
+        # train_test_split reads an int train_size as an absolute sample count,
+        # so convert the percentage to a fraction in (0, 1) first.
+        X_train, X_test, y_train, y_test = train_test_split(
+            data.data, data.target, train_size=percentage / 100.0)
+        model = LogisticRegression(C=10**-3)
+        model.fit(X_train, y_train)
+        trial_accuracies[trial] = model.score(X_test, y_test)
+    test_accuracies[i] = trial_accuracies.mean()
+
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
-plt.show()
+plt.show()
\ No newline at end of file
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..35d0e94
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,11 @@
+1. What is the general trend in the curve?
+The accuracy of the test generally increases as the percentage of the data used to train the machine increases.
+
+2. Are there parts of the curve that appear to be noisier than others? Why?
+The curve seems to be consistently noisy at 10 trials.
+
+3. How many trials do you need to get a smooth curve?
+The curve started to smooth out at 2000 trials per percentage.
+
+4. Try different values for C (by changing LogisticRegression(C=10**-10)). What happens? 
If you want to know why this happens, see this Wikipedia page as well as the documentation for LogisticRegression in scikit-learn. +As C gets larger, the noise in the curve decreases. \ No newline at end of file