diff --git a/learning_curve.py b/learning_curve.py
index fdce500..97eccef 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -20,7 +20,7 @@ def display_digits():
 
 def train_model():
     data = load_digits()
-    num_trials = 10
+    num_trials = 100
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -31,16 +31,29 @@ def train_model():
     # For consistency with the previous example use
     # model = LogisticRegression(C=10**-10) for your learner
 
-    # TODO: your code here
+    # for each training percentage, average the test accuracy over num_trials random splits
+    for i, size in enumerate(train_percentages):
+        accuracy = []
+        for j in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=size/100)
+
+            # the 'machine learning' part: fit the model on the training split
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            accuracy.append(model.score(X_test, y_test))
+
+        # average accuracy across the trials
+        test_accuracies[i] = numpy.mean(accuracy)
+
+    # plot percentage of data used for training vs. accuracy on the test set
 
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
     plt.xlabel('Percentage of Data Used for Training')
     plt.ylabel('Accuracy on Test Set')
     plt.show()
-
 
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..4eeb3a3
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,17 @@
+1. The curve demonstrates a positive correlation. As the percentage of the data
+used for training increases, the accuracy on the test set also increases. This is
+because with more training data, the model can more accurately classify the
+previously unseen examples in the test set.
+
+2. The noise is more apparent when a lower percentage of the data is used for
+training. This is probably because with very little training data, each random
+train/test split produces a noticeably different model, so the fitted model
+sometimes happens to generalize well (higher accuracy) and sometimes happens to
+generalize poorly (lower accuracy).
+
+3. The curve is smooth at around 100 trials or so.
+
+4. Varying C changes how much training data is needed to reach a given accuracy
+on the test data. A higher C (weaker regularization) means less training data is
+needed for the same accuracy, and a smaller C value means that more training data
+is needed to reach the same accuracy threshold.
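As a rough check of the claim in answer 4, one could rerun the same experiment for a few values of C and overlay the resulting learning curves. The sketch below is only illustrative, not part of the submitted code; it assumes a scikit-learn version where train_test_split lives in sklearn.model_selection (older releases used sklearn.cross_validation), and it uses fewer trials than the submission just to keep it quick.

import numpy
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 10  # fewer trials than the submission; enough to see the trend

# overlay one learning curve per regularization setting
for C in (10**-10, 10**-2, 1.0):
    test_accuracies = numpy.zeros(len(train_percentages))
    for i, size in enumerate(train_percentages):
        accuracy = []
        for _ in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=size / 100)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            accuracy.append(model.score(X_test, y_test))
        test_accuracies[i] = numpy.mean(accuracy)
    plt.plot(train_percentages, test_accuracies, label='C = {}'.format(C))

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()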