diff --git a/Answer.txt b/Answer.txt
new file mode 100644
index 0000000..851b8fd
--- /dev/null
+++ b/Answer.txt
@@ -0,0 +1,11 @@
+1.
+The Accuracy on Test Set increases as the Percentage of Data Used for Training increases, but the slope gradually decreases, so the curve flattens out.
+2.
+When the percentage of data used for training is relatively small, the curve is noisier; the standard deviation of the test accuracy is higher when the training set is small.
+3.
+I set num_trials to 1000 and the curve looks smooth, but it would take even more trials to make it perfectly smooth.
+4.
+As C gets bigger, the bending of the learning curve becomes more obvious. C is the 'inverse of regularization strength', so a bigger C means weaker regularization, which makes the graph smoother.
+
+
+
diff --git a/learning_curve.py b/learning_curve.py
index 2364f2c..90bd6bf 100644
--- a/learning_curve.py
+++ b/learning_curve.py
@@ -8,16 +8,21 @@
 data = load_digits()
 print data.DESCR
 
-num_trials = 10
+num_trials = 1000
 train_percentages = range(5,95,5)
 test_accuracies = numpy.zeros(len(train_percentages))
 
-# train a model with training percentages between 5 and 90 (see train_percentages) and evaluate
-# the resultant accuracy.
-# You should repeat each training percentage num_trials times to smooth out variability
-# for consistency with the previous example use model = LogisticRegression(C=10**-10) for your learner
-# TODO: your code here
+# average the test accuracy over num_trials random splits for each training percentage
+model = LogisticRegression(C=10**-10)
+for idx, pct in enumerate(train_percentages):
+    scores = []
+    for trial in range(num_trials):
+        # train_size is a fraction of the dataset, so convert the percentage to 0-1
+        X_train, X_test, Y_train, Y_test = train_test_split(data.data, data.target, train_size=pct/100.0)
+        model.fit(X_train, Y_train)
+        scores.append(model.score(X_test, Y_test))
+    test_accuracies[idx] = sum(scores)/num_trials
+
 fig = plt.figure()
 plt.plot(train_percentages, test_accuracies)
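
Note on item 2 of Answer.txt: the standard-deviation claim can be checked directly by recording the spread of the per-trial scores at each training percentage. The sketch below is illustrative, not part of the diff; it assumes a scikit-learn version where train_test_split lives in sklearn.model_selection (older releases used sklearn.cross_validation), and the trial count of 100 is an arbitrary choice to keep it quick.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # older sklearn: sklearn.cross_validation

data = load_digits()
num_trials = 100                       # fewer trials than in the diff, just for illustration
train_percentages = range(5, 95, 5)

means, stds = [], []
for pct in train_percentages:
    scores = []
    for _ in range(num_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, train_size=pct / 100.0)
        model = LogisticRegression(C=10 ** -10)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    means.append(np.mean(scores))
    stds.append(np.std(scores))        # expected to shrink as the training percentage grows

# error bars visualize the claim: more spread at small training percentages
plt.errorbar(list(train_percentages), means, yerr=stds)
plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.show()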
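Note on item 4: one way to see how C changes the shape of the curve is to overlay learning curves for a few values of C. This is a minimal sketch under the same assumptions as above (digits dataset, LogisticRegression, matplotlib); the specific C values and the trial count of 20 are illustrative choices, not taken from the assignment.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data = load_digits()
num_trials = 20
train_percentages = range(5, 95, 5)

plt.figure()
for C in (10 ** -10, 10 ** -5, 1.0):   # illustrative values only
    accuracies = []
    for pct in train_percentages:
        scores = []
        for _ in range(num_trials):
            X_train, X_test, y_train, y_test = train_test_split(
                data.data, data.target, train_size=pct / 100.0)
            model = LogisticRegression(C=C)
            model.fit(X_train, y_train)
            scores.append(model.score(X_test, y_test))
        accuracies.append(np.mean(scores))
    # weaker regularization (larger C) should rise faster and plateau sooner
    plt.plot(list(train_percentages), accuracies, label='C = %g' % C)

plt.xlabel('Percentage of Data Used for Training')
plt.ylabel('Accuracy on Test Set')
plt.legend()
plt.show()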