From d73d6c4cfad6d553c42cf76d1006ea34b85a41b8 Mon Sep 17 00:00:00 2001 From: Jeremy Garcia Date: Sat, 19 Mar 2016 16:46:42 -0400 Subject: [PATCH] turning in project toolbox - machine learning --- learning_curve.py | 10 +++++++--- questions.txt | 7 +++++++ questions.txt~ | 7 +++++++ 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 questions.txt create mode 100644 questions.txt~ diff --git a/learning_curve.py b/learning_curve.py index 2364f2c..5a02e2c 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -8,7 +8,7 @@ data = load_digits() print data.DESCR -num_trials = 10 +num_trials = 100 train_percentages = range(5,95,5) test_accuracies = numpy.zeros(len(train_percentages)) @@ -16,8 +16,12 @@ # the resultant accuracy. # You should repeat each training percentage num_trials times to smooth out variability # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner - -# TODO: your code here +for i in range (len(train_percentages)): + for j in range(num_trials): + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentages[i]) + model = LogisticRegression(C=10**1) + model.fit(X_train, y_train) + test_accuracies[i] += model.score(X_test,y_test) fig = plt.figure() plt.plot(train_percentages, test_accuracies) diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..6b498ee --- /dev/null +++ b/questions.txt @@ -0,0 +1,7 @@ +1. Generally, as the percentage of data used increases, so does the accuracy on the test set. + +2. It seemed to me that the whole curve appeared to be noisy, especially at a low number of trials. There didn't seem to be an area that was noisier than others. + +3. After around 700 or 800 trials, the curve starts to become smooth. + +4. As C gets larger, the graph becomes smoother, and as C gets smaller, the graph becomes very noisy. 
diff --git a/questions.txt~ b/questions.txt~ new file mode 100644 index 0000000..6b498ee --- /dev/null +++ b/questions.txt~ @@ -0,0 +1,7 @@ +1. Generally, as the percentage of data used increases, so does the accuracy on the test set. + +2. It seemed to me that the whole curve appeared to be noisy, especially at a low number of trials. There didn't seem to be an area that was noisier than others. + +3. After around 700 or 800 trials, the curve starts to become smooth. + +4. As C gets larger, the graph becomes smoother, and as C gets smaller, the graph becomes very noisy.