From 6896c5a19accd741e664a1aaf47ef4297c398e59 Mon Sep 17 00:00:00 2001
From: Prava Dhulipalla
Date: Tue, 28 Mar 2017 23:26:51 -0400
Subject: [PATCH 1/2] Answering the necessary questions for Machine Learning
 Project Toolbox

---
 questions.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 questions.txt

diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..4eeb3a3
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,17 @@
+1. The curve shows a positive correlation: as the percentage of the data used
+for training increases, the accuracy on the test set also increases. With more
+training data, the model has seen a wider variety of examples and can better
+generalize to the unseen values in the test set.
+
+2. The noise is more apparent when a lower percentage of the data is used for
+training. With little training data, each random train/test split hands the
+model a different small subset to learn from, so its test accuracy swings from
+run to run: sometimes the subset happens to be representative (higher accuracy)
+and sometimes it does not (lower accuracy).
+
+3. The curve becomes smooth at around 100 trials or so.
+
+4. Varying C (scikit-learn's inverse regularization strength) changes how much
+training data is needed to reach a given accuracy on the test set. A higher C
+means less training data is needed for the same accuracy, while a smaller C
+means more training data is needed to reach the same accuracy threshold.

From 326fdad09f4b794dd337f6bbbd44132d8ffdc720 Mon Sep 17 00:00:00 2001
From: Prava Dhulipalla
Date: Tue, 28 Mar 2017 23:27:47 -0400
Subject: [PATCH 2/2] This is the main file for the Machine Learning Project
 Toolbox

---
 learning_curve.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/learning_curve.py b/learning_curve.py
index fdce500..97eccef 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -20,7 +20,7 @@ def display_digits():
 
 def train_model():
     data = load_digits()
-    num_trials = 10
+    num_trials = 100
     train_percentages = range(5, 95, 5)
     test_accuracies = numpy.zeros(len(train_percentages))
 
@@ -31,16 +31,29 @@
     # For consistency with the previous example use
     # model = LogisticRegression(C=10**-10) for your learner
 
-    # TODO: your code here
+    # enumerate pairs each training percentage with its index in the results array
+    for i, size in enumerate(train_percentages):
+        accuracy = []
+        for j in range(num_trials):
+            X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=size / 100.0)
+
+            # fit the logistic regression model and score it on the held-out test set
+            model = LogisticRegression(C=10**-10)
+            model.fit(X_train, y_train)
+            accuracy.append(model.score(X_test, y_test))
+
+        # average the accuracy over all trials for this training percentage
+        test_accuracies[i] = numpy.mean(accuracy)
+
+    # plot percentage of data used for training vs. accuracy on the test set
     fig = plt.figure()
     plt.plot(train_percentages, test_accuracies)
     plt.xlabel('Percentage of Data Used for Training')
     plt.ylabel('Accuracy on Test Set')
     plt.show()
 
-
 if __name__ == "__main__":
     # Feel free to comment/uncomment as needed
-    display_digits()
-    # train_model()
+    # display_digits()
+    train_model()
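
The run-to-run noise described in answers 2 and 3 can be measured directly. Below
is a minimal standalone sketch, not part of either patch, that reruns the random
train/test split many times at a small and a large training fraction and reports
the spread of the test accuracy. The names digits, train_fraction, and scores are
illustrative; it assumes only the scikit-learn pieces the patch already uses
(load_digits, LogisticRegression, train_test_split).

    # Standalone sketch (assumes the same scikit-learn API as the patch).
    # Quantifies the run-to-run noise from answers 2 and 3 by measuring the
    # spread of test accuracy across repeated random train/test splits.
    import numpy
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    for train_fraction in (0.05, 0.90):
        scores = []
        for _ in range(100):
            X_train, X_test, y_train, y_test = train_test_split(
                digits.data, digits.target, train_size=train_fraction)
            model = LogisticRegression(C=10**-10)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        print('train_size=%.2f: mean=%.3f, std=%.3f'
              % (train_fraction, numpy.mean(scores), numpy.std(scores)))

The standard deviation at train_size=0.05 should come out noticeably larger than
at 0.90, which is the noise visible at the left edge of the plotted curve.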
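
Answer 4's claim about C can likewise be checked by overlaying learning curves
for several C values. This is a sketch under the same assumptions as above, not
part of the patch; in scikit-learn, C is the inverse regularization strength, so
a larger C regularizes less and lets the model fit the available training data
more closely.

    # Standalone sketch (assumes the same scikit-learn API as the patch).
    # Overlays learning curves for several values of C to illustrate how C
    # shifts the amount of training data needed to reach a given accuracy.
    import numpy
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    digits = load_digits()
    train_percentages = range(5, 95, 5)
    for C in (10**-10, 10**-5, 1.0):
        curve = []
        for size in train_percentages:
            trial_scores = []
            for _ in range(10):  # fewer trials than the patch, for speed
                X_train, X_test, y_train, y_test = train_test_split(
                    digits.data, digits.target, train_size=size / 100.0)
                model = LogisticRegression(C=C)
                model.fit(X_train, y_train)
                trial_scores.append(model.score(X_test, y_test))
            curve.append(numpy.mean(trial_scores))
        plt.plot(list(train_percentages), curve, label='C=%g' % C)
    plt.xlabel('Percentage of Data Used for Training')
    plt.ylabel('Accuracy on Test Set')
    plt.legend()
    plt.show()

With only 10 trials per point these curves are noisier than the patch's, but the
ordering by C should still be visible: the higher-C curves sit above the
heavily regularized C=10**-10 curve at the same training percentage.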