From 603032baafbe116569d4296a3c8af3c62733181c Mon Sep 17 00:00:00 2001
From: John Moreland
Date: Sun, 27 Mar 2016 14:36:55 -0400
Subject: [PATCH] Turning in my machine learning toolbox

---
Review notes (free-form text in this section is not part of the commit message):
  * learning_curve.py: each plotted point is now the MEAN test accuracy over
    num_trials random splits. The submitted version appended the SUM of the
    per-trial scores and hard-coded 10 trials instead of using num_trials.
  * The unused training-set score accumulation was dropped.
  * The added hunk still contains 12 lines, so the hunk headers and the
    diffstat below are unchanged; the "index" blob hash for learning_curve.py
    still describes the originally submitted content.

 learning_curve.py | 15 +++++++++++++--
 questions.txt     |  7 +++++++
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 questions.txt

diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..a96703a 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -7,7 +7,7 @@ from sklearn.linear_model import LogisticRegression
 
 
 data = load_digits()
-print data.DESCR
+# print data.DESCR
 num_trials = 10
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
@@ -16,8 +16,19 @@
 # the resultant accuracy.
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+# Mean held-out accuracy per training percentage, averaged over num_trials random splits.
+test_accuracies = []
+for percent in train_percentages:
+    train_size = percent / 100.0
+    cumulative_test = 0.0
+    for _ in range(num_trials):
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_size)
+        # NOTE(review): the instructions above ask for C=10**-10; 10**-20 looks left over from the C experiment -- confirm.
+        model = LogisticRegression(C=10**-20)
+        model.fit(X_train, y_train)
+        cumulative_test += model.score(X_test, y_test)
+    test_accuracies.append(cumulative_test / num_trials)
 
-# TODO: your code here
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..8643686
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,7 @@
+1. It seems like a logarithmic graph - it is increasing but has a downward curve.
+
+2. It seems to be noisiest in the 40-60 range. I would imagine because this is the area where the training set becomes larger than the testing set.
+
+3. It seems to smooth out upwards of 75-100 trials.
+
+4. With a much larger C (10^-1, 10), the graph rises sharply in the 20-30% range, then mellows out a lot. With a very small C (10^-20), the graph shoots up quickly at the beginning then falls down after about 25%.
\ No newline at end of file