diff --git a/1000_trials.png b/1000_trials.png new file mode 100644 index 0000000..52a419d Binary files /dev/null and b/1000_trials.png differ diff --git a/100_trials.png b/100_trials.png new file mode 100644 index 0000000..8d32156 Binary files /dev/null and b/100_trials.png differ diff --git a/10_trials.png b/10_trials.png new file mode 100644 index 0000000..13296d3 Binary files /dev/null and b/10_trials.png differ diff --git a/learning_curve.py b/learning_curve.py index 2364f2c..9226bc5 100644 --- a/learning_curve.py +++ b/learning_curve.py @@ -1,26 +1,28 @@ """ Exploring learning curves for classification of handwritten digits """ import matplotlib.pyplot as plt -import numpy from sklearn.datasets import * from sklearn.cross_validation import train_test_split from sklearn.linear_model import LogisticRegression data = load_digits() -print data.DESCR num_trials = 10 -train_percentages = range(5,95,5) -test_accuracies = numpy.zeros(len(train_percentages)) +train_percentages = map(lambda x: x/100.0, range(5,95,5)) -# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate -# the resultant accuracy. 
-# You should repeat each training percentage num_trials times to smooth out variability -# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner -# TODO: your code here +def train(percent, trials): + scores = [] + model = LogisticRegression(C=10**-10) + for i in range(trials): + X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, train_size=(percent)) #Split the data into test and training + model.fit(X_train, y_train) #Train the model on the training set + scores.append(model.score(X_test,y_test)) #store the results + return sum(scores)/float(trials) #return the average result + +results = [train(percent, num_trials) for percent in train_percentages] #list comprehension to iterate through percentages fig = plt.figure() -plt.plot(train_percentages, test_accuracies) +plt.plot(train_percentages, results) plt.xlabel('Percentage of Data Used for Training') plt.ylabel('Accuracy on Test Set') plt.show() diff --git a/questions.txt b/questions.txt new file mode 100644 index 0000000..4adcc6e --- /dev/null +++ b/questions.txt @@ -0,0 +1,11 @@ +1. The general trend in the curve is that as you use a larger percentage of your data for training, you get better results in testing +This is because it has more time to 'learn' and it can generate more accurate factors for the test. +The curve could reasonably be approximated by a line, but it actually looks much more like a logarithmic curve + +2. Some parts of the curve are noisier because the randomness for how it's broken up matters more for smaller values +For really small training sets you can get a set that is highly representative of handwriting or one that is much less representative + +3. My curve at 10 trials was distinctly not smooth, at 100 trials it was still somewhat rough, at 1000 it was very smooth +somewhere between 100 and 1000 you will get a smooth curve, but it's guaranteed at 10^3 trials + +4. C is how accurate you want the model to be. 
At higher values it becomes more precise and at lower values the model is rougher. \ No newline at end of file