diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..a61afbc 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -1,26 +1,88 @@
 """ Exploring learning curves for classification of handwritten digits """
 
 import matplotlib.pyplot as plt
-import numpy
+import numpy as np
 from sklearn.datasets import *
 from sklearn.cross_validation import train_test_split
 from sklearn.linear_model import LogisticRegression
 
 data = load_digits()
 print data.DESCR
-num_trials = 10
+
+X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=0.5)
+model = LogisticRegression(C=10**-10)
+model.fit(X_train, y_train)
+print "Train accuracy %f" %model.score(X_train,y_train)
+print "Test accuracy %f"%model.score(X_test,y_test)
+
+# fig = plt.figure()
+# for i in range(10):
+#     subplot = fig.add_subplot(5,2,i+1)
+#     subplot.matshow(np.reshape(data.data[i],(8,8)), cmap='gray')
+# plt.show()
+
+num_trials = 50 #200 is super accurate
+index = 0
+
 train_percentages = range(5,95,5)
-test_accuracies = numpy.zeros(len(train_percentages))
+test_accuracies = []
+
+X_train_total = {}
+X_test_total = {}
+y_train_total = {}
+y_test_total = {}
+
+
+#loop through different C values, if you want.
+for c in [0]:  # range(0, -20, -5):
+    fig = plt.figure()
+    Cval = 10**c
+    for i in range(num_trials):
+        for p in train_percentages:
+            results = train_test_split(data.data, data.target, train_size=p/100.0)
+            if i == 0:
+                X_train_total[p] = results[0]
+                X_test_total[p] = results[1]
+                y_train_total[p] = results[2]
+                y_test_total[p] = results[3]
+            else:
+                X_train_total[p] = np.add(X_train_total.get(p, 0), results[0])
+                X_test_total[p] = np.add(X_test_total.get(p, 0), results[1])
+                y_train_total[p] = np.add(y_train_total.get(p, 0), results[2])
+                y_test_total[p] = np.add(y_test_total.get(p, 0), results[3])
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
+
+            model = LogisticRegression(C=Cval)
+            model.fit(results[0], results[2])
+            test_accuracies.append(model.score(results[1],results[3]))
 
-# TODO: your code here
+        plt.plot(train_percentages, test_accuracies)
+        test_accuracies = []
 
-fig = plt.figure()
-plt.plot(train_percentages, test_accuracies)
+        # print '***** {}% *****'.format(p)
+        # print "Train accuracy %f" %model.score(X_train,y_train)
+        # print "Test accuracy %f"%model.score(X_test,y_test)
+
+#plt.plot(train_percentages, test_accuracies)
+plt.xlabel('Percentage of Data Used for Training')
+plt.ylabel('Accuracy on Test Set')
+
+fig2 = plt.figure()
+test_accuracies_average = []
+print len(X_train_total)
+print X_train_total[5]
+for p in train_percentages:
+    X_train = X_train_total[p]/num_trials
+    X_test = X_test_total[p]/num_trials
+    y_train = y_train_total[p]/num_trials
+    y_test = y_test_total[p]/num_trials
+
+    model = LogisticRegression(C=1.0)
+    model.fit(X_train, y_train)
+    test_accuracies_average.append(model.score(X_test,y_test))
+plt.plot(train_percentages, test_accuracies_average)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
 plt.show()
+
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..f350552
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,15 @@
+1. What is the general trend in the curve?
+
+The curve looks roughly like an inverse function: the larger the percentage of the data used for training, the more accurate the results on the test set.
+
+2. Are there parts of the curve that appear to be noisier than others? Why?
+
+The first half of the curve (especially the middle) is much noisier than the end, where most of the set is used for training. This is probably because a model trained on only a small slice of the data varies a lot from one random split to the next.
+
+3. How many trials do you need to get a smooth curve?
+
+I never got a perfectly smooth curve, but by around 20 trials a clear trend is visible in the graph that plots every trial.
+
+4. Try different values for C (by changing LogisticRegression(C=10**-10)). What happens? If you want to know why this happens, see this Wikipedia page as well as the documentation for LogisticRegression in scikit-learn.
+
+The lower the value of C, the noisier the curve is. A C value of 1 generates a pretty clean line, whereas a C value of 10^-20 shows wild variation across trials.
\ No newline at end of file
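
Not part of the commit above: a minimal sketch of the smoothing approach discussed in questions 3 and 4, assuming the same load_digits data and the old sklearn.cross_validation import used in learning_curve.py (sklearn.model_selection in newer scikit-learn). It averages the test accuracy over the repeated random splits rather than summing and dividing the split arrays themselves, and loops over a couple of example C exponents; names like mean_accuracies are made up for illustration.

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import train_test_split
    from sklearn.linear_model import LogisticRegression

    data = load_digits()
    num_trials = 50
    train_percentages = range(5, 95, 5)

    for c in [0, -10]:  # example exponents; C = 10**c
        mean_accuracies = []
        for p in train_percentages:
            scores = []
            for _ in range(num_trials):
                # fresh random split each trial; only the accuracy is averaged
                X_train, X_test, y_train, y_test = train_test_split(
                    data.data, data.target, train_size=p / 100.0)
                model = LogisticRegression(C=10 ** c)
                model.fit(X_train, y_train)
                scores.append(model.score(X_test, y_test))
            mean_accuracies.append(np.mean(scores))
        plt.plot(train_percentages, mean_accuracies, label='C = 10**%d' % c)

    plt.xlabel('Percentage of Data Used for Training')
    plt.ylabel('Accuracy on Test Set')
    plt.legend(loc='lower right')
    plt.show()

Averaging the accuracy rather than the split arrays keeps the digit labels integer-valued, and the curve gets smoother as more trials are added.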