diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..5c38488 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -17,10 +17,32 @@
 
 # You should repeat each training percentage num_trials times to smooth out variability
 # for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+def train_model(train_percentage):
+    train_size = train_percentage / 100.0
+
+    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=train_size)
+    model = LogisticRegression(C=10**-10)
+    model.fit(X_train, y_train)
+
+    return model.score(X_test,y_test)
+
+for index in range(len(train_percentages)):
+    print index
+    temp = []
+    for i in range(50):
+        temp.append(train_model(train_percentages[index]))
+    test_accuracies[index] = numpy.mean(temp)
+
 
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
 plt.xlabel('Percentage of Data Used for Training')
 plt.ylabel('Accuracy on Test Set')
 plt.show()
+
+
+'''CODE GRAVEYARD'''
+#inside train_model:
+    # print "Train accuracy %f" %model.score(X_train,y_train)
+    # print "Test accuracy %f"%model.score(X_test,y_test)
+    
\ No newline at end of file
diff --git a/questions.txt b/questions.txt
new file mode 100644
index 0000000..699b893
--- /dev/null
+++ b/questions.txt
@@ -0,0 +1,9 @@
+EFunkhouser questions.txt
+3/13/2016
+
+Machine Learning Toolbox
+
+1. The general trend of the curve is exponential approaching 1 (high slope at first, then tapers off as percentage of data used for training increases).
+2. I'm not seeing a part of the curve that clearly has more noise than the rest. My guess at what part I would EXPECT to see more noise in is the beginning of the curve, since it seems like if you have a small training dataset it could be hit-or-miss as to whether the stuff you train with is a good representation of the 'average' handwriting (what you're most likely to see in the testing dataset).
+3. 1000 trials got me a smooth curve. 
+4. Making C much larger (10^-6) got me a 'curvier' curve, with a more pronounced change in slope from beginning to end. Making C much smaller (10^-14) flattened the curve significantly, especially in the middle portion.