diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..776fa10 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,4 +1,4 @@
-""" Exploring learning curves for classification of handwritten digits """
+""" Exploring learning curves for classification of handwritten digits."""
 
 import matplotlib.pyplot as plt
 import numpy
@@ -6,18 +6,30 @@ from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
+
 
 data = load_digits()
 print data.DESCR
 
 num_trials = 10
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+for i in range(len(train_percentages)):
+
+    training = train_percentages[i]/100.0  # convert the percentage to the fraction expected by train_test_split
+    results = []
+
+    for j in range(num_trials):
+
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=training)
+        model = LogisticRegression(C=10**-10)
+        model.fit(X_train, y_train)
+        score = model.score(X_test, y_test)
+
+        results.append(score)
+
+    test_accuracies[i] = numpy.average(results)  # average over trials to smooth out variability
+
 
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..d232601
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,8 @@
+The general trend of the curve is upward: it increases rapidly at lower percentages of data used and more slowly at higher percentages.
+
+The curve consistently shows noise between 30 and 60 percent of data used. I think this may have to do with the ratio of data available for training to the amount available for testing.
+
+Around 100 trials are required for a decently smooth curve; upwards of 400 trials give the curve a much smoother finish.
+
+A C value of 10 appears to smooth the curve out so that it increases in a more linear progression, while a graph with a C value of 2 looks very logarithmic. The C=2 graph also starts at an accuracy of around 0.82.
+
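
The answers in questions.txt refer to two follow-up experiments: raising num_trials to smooth the curve, and comparing different C values. Below is a minimal standalone sketch of that experiment (not part of the patch), assuming a scikit-learn version where train_test_split lives in sklearn.model_selection rather than the older sklearn.cross_validation used in the file above; the trial count and C values mirror the ones discussed in questions.txt.

import numpy
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
train_percentages = range(5, 95, 5)
num_trials = 100  # ~100 trials gives a decently smooth curve; ~400 is smoother still

for C in (10**-10, 2, 10):  # regularization values compared in questions.txt
    curve = []
    for pct in train_percentages:
        scores = []
        for _ in range(num_trials):
            # resample a train/test split for each trial
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        curve.append(numpy.average(scores))  # average over trials to reduce noise
    print("C=%g: %s" % (C, ["%.3f" % a for a in curve]))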