From bf99edeebf851fee5e5671d084303f7164b68c1b Mon Sep 17 00:00:00 2001
From: Linnea Laux
Date: Sun, 27 Mar 2016 21:22:03 -0400
Subject: [PATCH] Last toolbox yayy

---
 learning_curve.py | 15 ++++++++-------
 questions.txt     |  4 ++++
 2 files changed, 12 insertions(+), 7 deletions(-)
 create mode 100644 questions.txt

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..b382b7d 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,20 +7,21 @@ from sklearn.linear_model import LogisticRegression
 
 
 data = load_digits()
-print data.DESCR
-num_trials = 10
+num_trials = 100
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+for i in range(len(train_percentages)):
+    for j in range(num_trials):  # repeat random splits to smooth out variability
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_percentages[i]/100.0)  # float = fraction of the data; an int would be a sample count
+        model = LogisticRegression(C=5**-5)
+        model.fit(X_train, y_train)
+        test_accuracies[i] += model.score(X_test, y_test)/num_trials  # accumulate the mean accuracy, not the sum
 
-# TODO: your code here
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
 plt.show()
+
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..f76c4d7
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,4 @@
+1. The general trend is upward.
+2. The beginning and end of the curve appear to be noisiest. This is likely because testing and training both require a certain amount of data to work well, so when almost all the data is being used for one or the other the curve can get messed up.
+3. The curve was pretty smooth around 5000 trials.
A good way to smooth the curve even further at large numbers of trials was to reduce the step size between training percentages from 5 to 1.
+4. Higher C values emphasize noise in the curve, and lower C values smooth it out.
\ No newline at end of file