From e8b1c8e1eadc3d29967393ea8ce0788d390befac Mon Sep 17 00:00:00 2001 From: Kalash Jindal <37014842+erickeagle@users.noreply.github.com> Date: Thu, 1 Oct 2020 15:13:38 +0530 Subject: [PATCH 1/2] Add files via upload --- Machine Learning/Breast_cancer_data.csv | 570 ++++++++++++ ... On Breast Cancer Prediction Dataset.ipynb | 871 ++++++++++++++++++ 2 files changed, 1441 insertions(+) create mode 100644 Machine Learning/Breast_cancer_data.csv create mode 100644 Machine Learning/Hyparameter Tuning- Grid search vs Bayesian optimization On Breast Cancer Prediction Dataset.ipynb diff --git a/Machine Learning/Breast_cancer_data.csv b/Machine Learning/Breast_cancer_data.csv new file mode 100644 index 00000000..8671a46c --- /dev/null +++ b/Machine Learning/Breast_cancer_data.csv @@ -0,0 +1,570 @@ +mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis +17.99,10.38,122.8,1001.0,0.1184,0 +20.57,17.77,132.9,1326.0,0.08474,0 +19.69,21.25,130.0,1203.0,0.1096,0 +11.42,20.38,77.58,386.1,0.1425,0 +20.29,14.34,135.1,1297.0,0.1003,0 +12.45,15.7,82.57,477.1,0.1278,0 +18.25,19.98,119.6,1040.0,0.09463,0 +13.71,20.83,90.2,577.9,0.1189,0 +13.0,21.82,87.5,519.8,0.1273,0 +12.46,24.04,83.97,475.9,0.1186,0 +16.02,23.24,102.7,797.8,0.08206,0 +15.78,17.89,103.6,781.0,0.0971,0 +19.17,24.8,132.4,1123.0,0.0974,0 +15.85,23.95,103.7,782.7,0.08401,0 +13.73,22.61,93.6,578.3,0.1131,0 +14.54,27.54,96.73,658.8,0.1139,0 +14.68,20.13,94.74,684.5,0.09867,0 +16.13,20.68,108.1,798.8,0.117,0 +19.81,22.15,130.0,1260.0,0.09831,0 +13.54,14.36,87.46,566.3,0.09779,1 +13.08,15.71,85.63,520.0,0.1075,1 +9.504,12.44,60.34,273.9,0.1024,1 +15.34,14.26,102.5,704.4,0.1073,0 +21.16,23.04,137.2,1404.0,0.09428,0 +16.65,21.38,110.0,904.6,0.1121,0 +17.14,16.4,116.0,912.7,0.1186,0 +14.58,21.53,97.41,644.8,0.1054,0 +18.61,20.25,122.1,1094.0,0.0944,0 +15.3,25.27,102.4,732.4,0.1082,0 +17.57,15.05,115.0,955.1,0.09847,0 +18.63,25.11,124.8,1088.0,0.1064,0 +11.84,18.7,77.93,440.6,0.1109,0 
+17.02,23.98,112.8,899.3,0.1197,0 +19.27,26.47,127.9,1162.0,0.09401,0 +16.13,17.88,107.0,807.2,0.104,0 +16.74,21.59,110.1,869.5,0.0961,0 +14.25,21.72,93.63,633.0,0.09823,0 +13.03,18.42,82.61,523.8,0.08983,1 +14.99,25.2,95.54,698.8,0.09387,0 +13.48,20.82,88.4,559.2,0.1016,0 +13.44,21.58,86.18,563.0,0.08162,0 +10.95,21.35,71.9,371.1,0.1227,0 +19.07,24.81,128.3,1104.0,0.09081,0 +13.28,20.28,87.32,545.2,0.1041,0 +13.17,21.81,85.42,531.5,0.09714,0 +18.65,17.6,123.7,1076.0,0.1099,0 +8.196,16.84,51.71,201.9,0.086,1 +13.17,18.66,85.98,534.6,0.1158,0 +12.05,14.63,78.04,449.3,0.1031,1 +13.49,22.3,86.91,561.0,0.08752,1 +11.76,21.6,74.72,427.9,0.08637,1 +13.64,16.34,87.21,571.8,0.07685,1 +11.94,18.24,75.71,437.6,0.08261,1 +18.22,18.7,120.3,1033.0,0.1148,0 +15.1,22.02,97.26,712.8,0.09056,0 +11.52,18.75,73.34,409.0,0.09524,1 +19.21,18.57,125.5,1152.0,0.1053,0 +14.71,21.59,95.55,656.9,0.1137,0 +13.05,19.31,82.61,527.2,0.0806,1 +8.618,11.79,54.34,224.5,0.09752,1 +10.17,14.88,64.55,311.9,0.1134,1 +8.598,20.98,54.66,221.8,0.1243,1 +14.25,22.15,96.42,645.7,0.1049,0 +9.173,13.86,59.2,260.9,0.07721,1 +12.68,23.84,82.69,499.0,0.1122,0 +14.78,23.94,97.4,668.3,0.1172,0 +9.465,21.01,60.11,269.4,0.1044,1 +11.31,19.04,71.8,394.1,0.08139,1 +9.029,17.33,58.79,250.5,0.1066,1 +12.78,16.49,81.37,502.5,0.09831,1 +18.94,21.31,123.6,1130.0,0.09009,0 +8.888,14.64,58.79,244.0,0.09783,1 +17.2,24.52,114.2,929.4,0.1071,0 +13.8,15.79,90.43,584.1,0.1007,0 +12.31,16.52,79.19,470.9,0.09172,1 +16.07,19.65,104.1,817.7,0.09168,0 +13.53,10.94,87.91,559.2,0.1291,1 +18.05,16.15,120.2,1006.0,0.1065,0 +20.18,23.97,143.7,1245.0,0.1286,0 +12.86,18.0,83.19,506.3,0.09934,1 +11.45,20.97,73.81,401.5,0.1102,1 +13.34,15.86,86.49,520.0,0.1078,1 +25.22,24.91,171.5,1878.0,0.1063,0 +19.1,26.29,129.1,1132.0,0.1215,0 +12.0,15.65,76.95,443.3,0.09723,1 +18.46,18.52,121.1,1075.0,0.09874,0 +14.48,21.46,94.25,648.2,0.09444,0 +19.02,24.59,122.0,1076.0,0.09029,0 +12.36,21.8,79.78,466.1,0.08772,1 +14.64,15.24,95.77,651.9,0.1132,1 
+14.62,24.02,94.57,662.7,0.08974,1 +15.37,22.76,100.2,728.2,0.092,0 +13.27,14.76,84.74,551.7,0.07355,1 +13.45,18.3,86.6,555.1,0.1022,1 +15.06,19.83,100.3,705.6,0.1039,0 +20.26,23.03,132.4,1264.0,0.09078,0 +12.18,17.84,77.79,451.1,0.1045,1 +9.787,19.94,62.11,294.5,0.1024,1 +11.6,12.84,74.34,412.6,0.08983,1 +14.42,19.77,94.48,642.5,0.09752,0 +13.61,24.98,88.05,582.7,0.09488,0 +6.981,13.43,43.79,143.5,0.117,1 +12.18,20.52,77.22,458.7,0.08013,1 +9.876,19.4,63.95,298.3,0.1005,1 +10.49,19.29,67.41,336.1,0.09989,1 +13.11,15.56,87.21,530.2,0.1398,0 +11.64,18.33,75.17,412.5,0.1142,1 +12.36,18.54,79.01,466.7,0.08477,1 +22.27,19.67,152.8,1509.0,0.1326,0 +11.34,21.26,72.48,396.5,0.08759,1 +9.777,16.99,62.5,290.2,0.1037,1 +12.63,20.76,82.15,480.4,0.09933,1 +14.26,19.65,97.83,629.9,0.07837,1 +10.51,20.19,68.64,334.2,0.1122,1 +8.726,15.83,55.84,230.9,0.115,1 +11.93,21.53,76.53,438.6,0.09768,1 +8.95,15.76,58.74,245.2,0.09462,1 +14.87,16.67,98.64,682.5,0.1162,0 +15.78,22.91,105.7,782.6,0.1155,0 +17.95,20.01,114.2,982.0,0.08402,0 +11.41,10.82,73.34,403.3,0.09373,1 +18.66,17.12,121.4,1077.0,0.1054,0 +24.25,20.2,166.2,1761.0,0.1447,0 +14.5,10.89,94.28,640.7,0.1101,1 +13.37,16.39,86.1,553.5,0.07115,1 +13.85,17.21,88.44,588.7,0.08785,1 +13.61,24.69,87.76,572.6,0.09258,0 +19.0,18.91,123.4,1138.0,0.08217,0 +15.1,16.39,99.58,674.5,0.115,1 +19.79,25.12,130.4,1192.0,0.1015,0 +12.19,13.29,79.08,455.8,0.1066,1 +15.46,19.48,101.7,748.9,0.1092,0 +16.16,21.54,106.2,809.8,0.1008,0 +15.71,13.93,102.0,761.7,0.09462,1 +18.45,21.91,120.2,1075.0,0.0943,0 +12.77,22.47,81.72,506.3,0.09055,0 +11.71,16.67,74.72,423.6,0.1051,1 +11.43,15.39,73.06,399.8,0.09639,1 +14.95,17.57,96.85,678.1,0.1167,0 +11.28,13.39,73.0,384.8,0.1164,1 +9.738,11.97,61.24,288.5,0.0925,1 +16.11,18.05,105.1,813.0,0.09721,0 +11.43,17.31,73.66,398.0,0.1092,1 +12.9,15.92,83.74,512.2,0.08677,1 +10.75,14.97,68.26,355.3,0.07793,1 +11.9,14.65,78.11,432.8,0.1152,1 +11.8,16.58,78.99,432.0,0.1091,0 +14.95,18.77,97.84,689.5,0.08138,1 
+14.44,15.18,93.97,640.1,0.0997,1 +13.74,17.91,88.12,585.0,0.07944,1 +13.0,20.78,83.51,519.4,0.1135,1 +8.219,20.7,53.27,203.9,0.09405,1 +9.731,15.34,63.78,300.2,0.1072,1 +11.15,13.08,70.87,381.9,0.09754,1 +13.15,15.34,85.31,538.9,0.09384,1 +12.25,17.94,78.27,460.3,0.08654,1 +17.68,20.74,117.4,963.7,0.1115,0 +16.84,19.46,108.4,880.2,0.07445,1 +12.06,12.74,76.84,448.6,0.09311,1 +10.9,12.96,68.69,366.8,0.07515,1 +11.75,20.18,76.1,419.8,0.1089,1 +19.19,15.94,126.3,1157.0,0.08694,0 +19.59,18.15,130.7,1214.0,0.112,0 +12.34,22.22,79.85,464.5,0.1012,1 +23.27,22.04,152.1,1686.0,0.08439,0 +14.97,19.76,95.5,690.2,0.08421,1 +10.8,9.71,68.77,357.6,0.09594,1 +16.78,18.8,109.3,886.3,0.08865,0 +17.47,24.68,116.1,984.6,0.1049,0 +14.97,16.95,96.22,685.9,0.09855,1 +12.32,12.39,78.85,464.1,0.1028,1 +13.43,19.63,85.84,565.4,0.09048,0 +15.46,11.89,102.5,736.9,0.1257,0 +11.08,14.71,70.21,372.7,0.1006,1 +10.66,15.15,67.49,349.6,0.08792,1 +8.671,14.45,54.42,227.2,0.09138,1 +9.904,18.06,64.6,302.4,0.09699,1 +16.46,20.11,109.3,832.9,0.09831,0 +13.01,22.22,82.01,526.4,0.06251,1 +12.81,13.06,81.29,508.8,0.08739,1 +27.22,21.87,182.1,2250.0,0.1094,0 +21.09,26.57,142.7,1311.0,0.1141,0 +15.7,20.31,101.2,766.6,0.09597,0 +11.41,14.92,73.53,402.0,0.09059,1 +15.28,22.41,98.92,710.6,0.09057,0 +10.08,15.11,63.76,317.5,0.09267,1 +18.31,18.58,118.6,1041.0,0.08588,0 +11.71,17.19,74.68,420.3,0.09774,1 +11.81,17.39,75.27,428.9,0.1007,1 +12.3,15.9,78.83,463.7,0.0808,1 +14.22,23.12,94.37,609.9,0.1075,0 +12.77,21.41,82.02,507.4,0.08749,1 +9.72,18.22,60.73,288.1,0.0695,1 +12.34,26.86,81.15,477.4,0.1034,0 +14.86,23.21,100.4,671.4,0.1044,0 +12.91,16.33,82.53,516.4,0.07941,1 +13.77,22.29,90.63,588.9,0.12,0 +18.08,21.84,117.4,1024.0,0.07371,0 +19.18,22.49,127.5,1148.0,0.08523,0 +14.45,20.22,94.49,642.7,0.09872,0 +12.23,19.56,78.54,461.0,0.09586,1 +17.54,19.32,115.1,951.6,0.08968,0 +23.29,26.67,158.9,1685.0,0.1141,0 +13.81,23.75,91.56,597.8,0.1323,0 +12.47,18.6,81.09,481.9,0.09965,1 +15.12,16.68,98.78,716.6,0.08876,0 
+9.876,17.27,62.92,295.4,0.1089,1 +17.01,20.26,109.7,904.3,0.08772,0 +13.11,22.54,87.02,529.4,0.1002,1 +15.27,12.91,98.17,725.5,0.08182,1 +20.58,22.14,134.7,1290.0,0.0909,0 +11.84,18.94,75.51,428.0,0.08871,1 +28.11,18.47,188.5,2499.0,0.1142,0 +17.42,25.56,114.5,948.0,0.1006,0 +14.19,23.81,92.87,610.7,0.09463,0 +13.86,16.93,90.96,578.9,0.1026,0 +11.89,18.35,77.32,432.2,0.09363,1 +10.2,17.48,65.05,321.2,0.08054,1 +19.8,21.56,129.7,1230.0,0.09383,0 +19.53,32.47,128.0,1223.0,0.0842,0 +13.65,13.16,87.88,568.9,0.09646,1 +13.56,13.9,88.59,561.3,0.1051,1 +10.18,17.53,65.12,313.1,0.1061,1 +15.75,20.25,102.6,761.3,0.1025,0 +13.27,17.02,84.55,546.4,0.08445,1 +14.34,13.47,92.51,641.2,0.09906,1 +10.44,15.46,66.62,329.6,0.1053,1 +15.0,15.51,97.45,684.5,0.08371,1 +12.62,23.97,81.35,496.4,0.07903,1 +12.83,22.33,85.26,503.2,0.1088,0 +17.05,19.08,113.4,895.0,0.1141,0 +11.32,27.08,71.76,395.7,0.06883,1 +11.22,33.81,70.79,386.8,0.0778,1 +20.51,27.81,134.4,1319.0,0.09159,0 +9.567,15.91,60.21,279.6,0.08464,1 +14.03,21.25,89.79,603.4,0.0907,1 +23.21,26.97,153.5,1670.0,0.09509,0 +20.48,21.46,132.5,1306.0,0.08355,0 +14.22,27.85,92.55,623.9,0.08223,1 +17.46,39.28,113.4,920.6,0.09812,0 +13.64,15.6,87.38,575.3,0.09423,1 +12.42,15.04,78.61,476.5,0.07926,1 +11.3,18.19,73.93,389.4,0.09592,1 +13.75,23.77,88.54,590.0,0.08043,1 +19.4,23.5,129.1,1155.0,0.1027,0 +10.48,19.86,66.72,337.7,0.107,1 +13.2,17.43,84.13,541.6,0.07215,1 +12.89,14.11,84.95,512.2,0.0876,1 +10.65,25.22,68.01,347.0,0.09657,1 +11.52,14.93,73.87,406.3,0.1013,1 +20.94,23.56,138.9,1364.0,0.1007,0 +11.5,18.45,73.28,407.4,0.09345,1 +19.73,19.82,130.7,1206.0,0.1062,0 +17.3,17.08,113.0,928.2,0.1008,0 +19.45,19.33,126.5,1169.0,0.1035,0 +13.96,17.05,91.43,602.4,0.1096,0 +19.55,28.77,133.6,1207.0,0.0926,0 +15.32,17.27,103.2,713.3,0.1335,0 +15.66,23.2,110.2,773.5,0.1109,0 +15.53,33.56,103.7,744.9,0.1063,0 +20.31,27.06,132.9,1288.0,0.1,0 +17.35,23.06,111.0,933.1,0.08662,0 +17.29,22.13,114.4,947.8,0.08999,0 +15.61,19.38,100.0,758.6,0.0784,0 
+17.19,22.07,111.6,928.3,0.09726,0 +20.73,31.12,135.7,1419.0,0.09469,0 +10.6,18.95,69.28,346.4,0.09688,1 +13.59,21.84,87.16,561.0,0.07956,1 +12.87,16.21,82.38,512.2,0.09425,1 +10.71,20.39,69.5,344.9,0.1082,1 +14.29,16.82,90.3,632.6,0.06429,1 +11.29,13.04,72.23,388.0,0.09834,1 +21.75,20.99,147.3,1491.0,0.09401,0 +9.742,15.67,61.5,289.9,0.09037,1 +17.93,24.48,115.2,998.9,0.08855,0 +11.89,17.36,76.2,435.6,0.1225,1 +11.33,14.16,71.79,396.6,0.09379,1 +18.81,19.98,120.9,1102.0,0.08923,0 +13.59,17.84,86.24,572.3,0.07948,1 +13.85,15.18,88.99,587.4,0.09516,1 +19.16,26.6,126.2,1138.0,0.102,0 +11.74,14.02,74.24,427.3,0.07813,1 +19.4,18.18,127.2,1145.0,0.1037,0 +16.24,18.77,108.8,805.1,0.1066,0 +12.89,15.7,84.08,516.6,0.07818,1 +12.58,18.4,79.83,489.0,0.08393,1 +11.94,20.76,77.87,441.0,0.08605,1 +12.89,13.12,81.89,515.9,0.06955,1 +11.26,19.96,73.72,394.1,0.0802,1 +11.37,18.89,72.17,396.0,0.08713,1 +14.41,19.73,96.03,651.0,0.08757,1 +14.96,19.1,97.03,687.3,0.08992,1 +12.95,16.02,83.14,513.7,0.1005,1 +11.85,17.46,75.54,432.7,0.08372,1 +12.72,13.78,81.78,492.1,0.09667,1 +13.77,13.27,88.06,582.7,0.09198,1 +10.91,12.35,69.14,363.7,0.08518,1 +11.76,18.14,75.0,431.1,0.09968,0 +14.26,18.17,91.22,633.1,0.06576,1 +10.51,23.09,66.85,334.2,0.1015,1 +19.53,18.9,129.5,1217.0,0.115,0 +12.46,19.89,80.43,471.3,0.08451,1 +20.09,23.86,134.7,1247.0,0.108,0 +10.49,18.61,66.86,334.3,0.1068,1 +11.46,18.16,73.59,403.1,0.08853,1 +11.6,24.49,74.23,417.2,0.07474,1 +13.2,15.82,84.07,537.3,0.08511,1 +9.0,14.4,56.36,246.3,0.07005,1 +13.5,12.71,85.69,566.2,0.07376,1 +13.05,13.84,82.71,530.6,0.08352,1 +11.7,19.11,74.33,418.7,0.08814,1 +14.61,15.69,92.68,664.9,0.07618,1 +12.76,13.37,82.29,504.1,0.08794,1 +11.54,10.72,73.73,409.1,0.08597,1 +8.597,18.6,54.09,221.2,0.1074,1 +12.49,16.85,79.19,481.6,0.08511,1 +12.18,14.08,77.25,461.4,0.07734,1 +18.22,18.87,118.7,1027.0,0.09746,0 +9.042,18.9,60.07,244.5,0.09968,1 +12.43,17.0,78.6,477.3,0.07557,1 +10.25,16.18,66.52,324.2,0.1061,1 +20.16,19.66,131.1,1274.0,0.0802,0 
+12.86,13.32,82.82,504.8,0.1134,1 +20.34,21.51,135.9,1264.0,0.117,0 +12.2,15.21,78.01,457.9,0.08673,1 +12.67,17.3,81.25,489.9,0.1028,1 +14.11,12.88,90.03,616.5,0.09309,1 +12.03,17.93,76.09,446.0,0.07683,1 +16.27,20.71,106.9,813.7,0.1169,0 +16.26,21.88,107.5,826.8,0.1165,0 +16.03,15.51,105.8,793.2,0.09491,0 +12.98,19.35,84.52,514.0,0.09579,1 +11.22,19.86,71.94,387.3,0.1054,1 +11.25,14.78,71.38,390.0,0.08306,1 +12.3,19.02,77.88,464.4,0.08313,1 +17.06,21.0,111.8,918.6,0.1119,0 +12.99,14.23,84.08,514.3,0.09462,1 +18.77,21.43,122.9,1092.0,0.09116,0 +10.05,17.53,64.41,310.8,0.1007,1 +23.51,24.27,155.1,1747.0,0.1069,0 +14.42,16.54,94.15,641.2,0.09751,1 +9.606,16.84,61.64,280.5,0.08481,1 +11.06,14.96,71.49,373.9,0.1033,1 +19.68,21.68,129.9,1194.0,0.09797,0 +11.71,15.45,75.03,420.3,0.115,1 +10.26,14.71,66.2,321.6,0.09882,1 +12.06,18.9,76.66,445.3,0.08386,1 +14.76,14.74,94.87,668.7,0.08875,1 +11.47,16.03,73.02,402.7,0.09076,1 +11.95,14.96,77.23,426.7,0.1158,1 +11.66,17.07,73.7,421.0,0.07561,1 +15.75,19.22,107.1,758.6,0.1243,0 +25.73,17.46,174.2,2010.0,0.1149,0 +15.08,25.74,98.0,716.6,0.1024,0 +11.14,14.07,71.24,384.6,0.07274,1 +12.56,19.07,81.92,485.8,0.0876,1 +13.05,18.59,85.09,512.0,0.1082,1 +13.87,16.21,88.52,593.7,0.08743,1 +8.878,15.49,56.74,241.0,0.08293,1 +9.436,18.32,59.82,278.6,0.1009,1 +12.54,18.07,79.42,491.9,0.07436,1 +13.3,21.57,85.24,546.1,0.08582,1 +12.76,18.84,81.87,496.6,0.09676,1 +16.5,18.29,106.6,838.1,0.09686,1 +13.4,16.95,85.48,552.4,0.07937,1 +20.44,21.78,133.8,1293.0,0.0915,0 +20.2,26.83,133.7,1234.0,0.09905,0 +12.21,18.02,78.31,458.4,0.09231,1 +21.71,17.25,140.9,1546.0,0.09384,0 +22.01,21.9,147.2,1482.0,0.1063,0 +16.35,23.29,109.0,840.4,0.09742,0 +15.19,13.21,97.65,711.8,0.07963,1 +21.37,15.1,141.3,1386.0,0.1001,0 +20.64,17.35,134.8,1335.0,0.09446,0 +13.69,16.07,87.84,579.1,0.08302,1 +16.17,16.07,106.3,788.5,0.0988,1 +10.57,20.22,70.15,338.3,0.09073,1 +13.46,28.21,85.89,562.1,0.07517,1 +13.66,15.15,88.27,580.6,0.08268,1 
+11.08,18.83,73.3,361.6,0.1216,0 +11.27,12.96,73.16,386.3,0.1237,1 +11.04,14.93,70.67,372.7,0.07987,1 +12.05,22.72,78.75,447.8,0.06935,1 +12.39,17.48,80.64,462.9,0.1042,1 +13.28,13.72,85.79,541.8,0.08363,1 +14.6,23.29,93.97,664.7,0.08682,0 +12.21,14.09,78.78,462.0,0.08108,1 +13.88,16.16,88.37,596.6,0.07026,1 +11.27,15.5,73.38,392.0,0.08365,1 +19.55,23.21,128.9,1174.0,0.101,0 +10.26,12.22,65.75,321.6,0.09996,1 +8.734,16.84,55.27,234.3,0.1039,1 +15.49,19.97,102.4,744.7,0.116,0 +21.61,22.28,144.4,1407.0,0.1167,0 +12.1,17.72,78.07,446.2,0.1029,1 +14.06,17.18,89.75,609.1,0.08045,1 +13.51,18.89,88.1,558.1,0.1059,1 +12.8,17.46,83.05,508.3,0.08044,1 +11.06,14.83,70.31,378.2,0.07741,1 +11.8,17.26,75.26,431.9,0.09087,1 +17.91,21.02,124.4,994.0,0.123,0 +11.93,10.91,76.14,442.7,0.08872,1 +12.96,18.29,84.18,525.2,0.07351,1 +12.94,16.17,83.18,507.6,0.09879,1 +12.34,14.95,78.29,469.1,0.08682,1 +10.94,18.59,70.39,370.0,0.1004,1 +16.14,14.86,104.3,800.0,0.09495,1 +12.85,21.37,82.63,514.5,0.07551,1 +17.99,20.66,117.8,991.7,0.1036,0 +12.27,17.92,78.41,466.1,0.08685,1 +11.36,17.57,72.49,399.8,0.08858,1 +11.04,16.83,70.92,373.2,0.1077,1 +9.397,21.68,59.75,268.8,0.07969,1 +14.99,22.11,97.53,693.7,0.08515,1 +15.13,29.81,96.71,719.5,0.0832,0 +11.89,21.17,76.39,433.8,0.09773,1 +9.405,21.7,59.6,271.2,0.1044,1 +15.5,21.08,102.9,803.1,0.112,0 +12.7,12.17,80.88,495.0,0.08785,1 +11.16,21.41,70.95,380.3,0.1018,1 +11.57,19.04,74.2,409.7,0.08546,1 +14.69,13.98,98.22,656.1,0.1031,1 +11.61,16.02,75.46,408.2,0.1088,1 +13.66,19.13,89.46,575.3,0.09057,1 +9.742,19.12,61.93,289.7,0.1075,1 +10.03,21.28,63.19,307.3,0.08117,1 +10.48,14.98,67.49,333.6,0.09816,1 +10.8,21.98,68.79,359.9,0.08801,1 +11.13,16.62,70.47,381.1,0.08151,1 +12.72,17.67,80.98,501.3,0.07896,1 +14.9,22.53,102.1,685.0,0.09947,0 +12.4,17.68,81.47,467.8,0.1054,1 +20.18,19.54,133.8,1250.0,0.1133,0 +18.82,21.97,123.7,1110.0,0.1018,0 +14.86,16.94,94.89,673.7,0.08924,1 +13.98,19.62,91.12,599.5,0.106,0 +12.87,19.54,82.67,509.2,0.09136,1 
+14.04,15.98,89.78,611.2,0.08458,1 +13.85,19.6,88.68,592.6,0.08684,1 +14.02,15.66,89.59,606.5,0.07966,1 +10.97,17.2,71.73,371.5,0.08915,1 +17.27,25.42,112.4,928.8,0.08331,0 +13.78,15.79,88.37,585.9,0.08817,1 +10.57,18.32,66.82,340.9,0.08142,1 +18.03,16.85,117.5,990.0,0.08947,0 +11.99,24.89,77.61,441.3,0.103,1 +17.75,28.03,117.3,981.6,0.09997,0 +14.8,17.66,95.88,674.8,0.09179,1 +14.53,19.34,94.25,659.7,0.08388,1 +21.1,20.52,138.1,1384.0,0.09684,0 +11.87,21.54,76.83,432.0,0.06613,1 +19.59,25.0,127.7,1191.0,0.1032,0 +12.0,28.23,76.77,442.5,0.08437,1 +14.53,13.98,93.86,644.2,0.1099,1 +12.62,17.15,80.62,492.9,0.08583,1 +13.38,30.72,86.34,557.2,0.09245,1 +11.63,29.29,74.87,415.1,0.09357,1 +13.21,25.25,84.1,537.9,0.08791,1 +13.0,25.13,82.61,520.2,0.08369,1 +9.755,28.2,61.68,290.9,0.07984,1 +17.08,27.15,111.2,930.9,0.09898,0 +27.42,26.27,186.9,2501.0,0.1084,0 +14.4,26.99,92.25,646.1,0.06995,1 +11.6,18.36,73.88,412.7,0.08508,1 +13.17,18.22,84.28,537.3,0.07466,1 +13.24,20.13,86.87,542.9,0.08284,1 +13.14,20.74,85.98,536.9,0.08675,1 +9.668,18.1,61.06,286.3,0.08311,1 +17.6,23.33,119.0,980.5,0.09289,0 +11.62,18.18,76.38,408.8,0.1175,1 +9.667,18.49,61.49,289.1,0.08946,1 +12.04,28.14,76.85,449.9,0.08752,1 +14.92,14.93,96.45,686.9,0.08098,1 +12.27,29.97,77.42,465.4,0.07699,1 +10.88,15.62,70.41,358.9,0.1007,1 +12.83,15.73,82.89,506.9,0.0904,1 +14.2,20.53,92.41,618.4,0.08931,1 +13.9,16.62,88.97,599.4,0.06828,1 +11.49,14.59,73.99,404.9,0.1046,1 +16.25,19.51,109.8,815.8,0.1026,0 +12.16,18.03,78.29,455.3,0.09087,1 +13.9,19.24,88.73,602.9,0.07991,1 +13.47,14.06,87.32,546.3,0.1071,1 +13.7,17.64,87.76,571.1,0.0995,1 +15.73,11.28,102.8,747.2,0.1043,1 +12.45,16.41,82.85,476.7,0.09514,1 +14.64,16.85,94.21,666.0,0.08641,1 +19.44,18.82,128.1,1167.0,0.1089,0 +11.68,16.17,75.49,420.5,0.1128,1 +16.69,20.2,107.1,857.6,0.07497,0 +12.25,22.44,78.18,466.5,0.08192,1 +17.85,13.23,114.6,992.1,0.07838,1 +18.01,20.56,118.4,1007.0,0.1001,0 +12.46,12.83,78.83,477.3,0.07372,1 
+13.16,20.54,84.06,538.7,0.07335,1 +14.87,20.21,96.12,680.9,0.09587,1 +12.65,18.17,82.69,485.6,0.1076,1 +12.47,17.31,80.45,480.1,0.08928,1 +18.49,17.52,121.3,1068.0,0.1012,0 +20.59,21.24,137.8,1320.0,0.1085,0 +15.04,16.74,98.73,689.4,0.09883,1 +13.82,24.49,92.33,595.9,0.1162,0 +12.54,16.32,81.25,476.3,0.1158,1 +23.09,19.83,152.1,1682.0,0.09342,0 +9.268,12.87,61.49,248.7,0.1634,1 +9.676,13.14,64.12,272.5,0.1255,1 +12.22,20.04,79.47,453.1,0.1096,1 +11.06,17.12,71.25,366.5,0.1194,1 +16.3,15.7,104.7,819.8,0.09427,1 +15.46,23.95,103.8,731.3,0.1183,0 +11.74,14.69,76.31,426.0,0.08099,1 +14.81,14.7,94.66,680.7,0.08472,1 +13.4,20.52,88.64,556.7,0.1106,0 +14.58,13.66,94.29,658.8,0.09832,1 +15.05,19.07,97.26,701.9,0.09215,0 +11.34,18.61,72.76,391.2,0.1049,1 +18.31,20.58,120.8,1052.0,0.1068,0 +19.89,20.26,130.5,1214.0,0.1037,0 +12.88,18.22,84.45,493.1,0.1218,1 +12.75,16.7,82.51,493.8,0.1125,1 +9.295,13.9,59.96,257.8,0.1371,1 +24.63,21.6,165.5,1841.0,0.103,0 +11.26,19.83,71.3,388.1,0.08511,1 +13.71,18.68,88.73,571.0,0.09916,1 +9.847,15.68,63.0,293.2,0.09492,1 +8.571,13.1,54.53,221.3,0.1036,1 +13.46,18.75,87.44,551.1,0.1075,1 +12.34,12.27,78.94,468.5,0.09003,1 +13.94,13.17,90.31,594.2,0.1248,1 +12.07,13.44,77.83,445.2,0.11,1 +11.75,17.56,75.89,422.9,0.1073,1 +11.67,20.02,75.21,416.2,0.1016,1 +13.68,16.33,87.76,575.5,0.09277,1 +20.47,20.67,134.7,1299.0,0.09156,0 +10.96,17.62,70.79,365.6,0.09687,1 +20.55,20.86,137.8,1308.0,0.1046,0 +14.27,22.55,93.77,629.8,0.1038,0 +11.69,24.44,76.37,406.4,0.1236,1 +7.729,25.49,47.98,178.8,0.08098,1 +7.691,25.44,48.34,170.4,0.08668,1 +11.54,14.44,74.65,402.9,0.09984,1 +14.47,24.99,95.81,656.4,0.08837,1 +14.74,25.42,94.7,668.6,0.08275,1 +13.21,28.06,84.88,538.4,0.08671,1 +13.87,20.7,89.77,584.8,0.09578,1 +13.62,23.23,87.19,573.2,0.09246,1 +10.32,16.35,65.31,324.9,0.09434,1 +10.26,16.58,65.85,320.8,0.08877,1 +9.683,19.34,61.05,285.7,0.08491,1 +10.82,24.21,68.89,361.6,0.08192,1 +10.86,21.48,68.51,360.5,0.07431,1 +11.13,22.44,71.49,378.4,0.09566,1 
+12.77,29.43,81.35,507.9,0.08276,1 +9.333,21.94,59.01,264.0,0.0924,1 +12.88,28.92,82.5,514.3,0.08123,1 +10.29,27.61,65.67,321.4,0.0903,1 +10.16,19.59,64.73,311.7,0.1003,1 +9.423,27.88,59.26,271.3,0.08123,1 +14.59,22.68,96.39,657.1,0.08473,1 +11.51,23.93,74.52,403.5,0.09261,1 +14.05,27.15,91.38,600.4,0.09929,1 +11.2,29.37,70.67,386.0,0.07449,1 +15.22,30.62,103.4,716.9,0.1048,0 +20.92,25.09,143.0,1347.0,0.1099,0 +21.56,22.39,142.0,1479.0,0.111,0 +20.13,28.25,131.2,1261.0,0.0978,0 +16.6,28.08,108.3,858.1,0.08455,0 +20.6,29.33,140.1,1265.0,0.1178,0 +7.76,24.54,47.92,181.0,0.05263,1 diff --git a/Machine Learning/Hyparameter Tuning- Grid search vs Bayesian optimization On Breast Cancer Prediction Dataset.ipynb b/Machine Learning/Hyparameter Tuning- Grid search vs Bayesian optimization On Breast Cancer Prediction Dataset.ipynb new file mode 100644 index 00000000..c8880b4d --- /dev/null +++ b/Machine Learning/Hyparameter Tuning- Grid search vs Bayesian optimization On Breast Cancer Prediction Dataset.ipynb @@ -0,0 +1,871 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hyparameter Tuning- Grid search vs Bayesian optimization On Breast Cancer Prediction Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In machine learning, hyperparameter optimization or tuning is the problem of choosing a set of optimal hyperparameters for a learning algorithm. A hyperparameter is a parameter whose value is used to control the learning process." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Grid Search is the process of scanning the data to configure optimal parameters for a given model. Depending on the type of model utilized, certain parameters are necessary. Grid-searching does NOT only apply to one model type. Grid-searching can be applied across machine learning to calculate the best parameters to use for any given model. 
It is important to note that Grid-searching can be extremely computationally expensive and may take your machine quite a long time to run. Grid-Search will build a model on each parameter combination possible. It iterates through every parameter combination and stores a model for each combination." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bayesian Optimization provides a technique based on Bayes Theorem to direct a search of a global optimization problem that is efficient and effective. It works by building a probabilistic model of the objective function, called the surrogate function, that is then searched efficiently with an acquisition function before candidate samples are chosen for evaluation on the real objective function.\n", + "Bayesian Optimization is often used in applied machine learning to tune the hyperparameters of a given well-performing model on a validation dataset. It is an approach that is most useful for objective functions that are complex, noisy, and/or expensive to evaluate.\n", + "\n", + "Bayes Theorem is an approach for calculating the conditional probability of an event:\n", + "
  • P(A|B) = P(B|A) * P(A) / P(B)
  • \n", + "We can simplify this calculation by removing the normalizing value of P(B) and describe the conditional probability as a proportional quantity. This is useful as we are not interested in calculating a specific conditional probability, but instead in optimizing a quantity.\n", + "
  • P(A|B) ∝ P(B|A) * P(A)
  • \n", + "The conditional probability that we are calculating is referred to generally as the posterior probability, the reverse conditional probability is sometimes referred to as the likelihood, and the marginal probability is referred to as the prior probability, for example:\n", + "
  • posterior ∝ likelihood * prior
  • \n", + "\n", + "This provides a framework that can be used to quantify the beliefs about an unknown objective function given samples from the domain and their evaluation via the objective function.\n", + "\n", + "We can devise specific samples (x1, x2, …, xn) and evaluate them using the objective function f(xi) that returns the cost or outcome for the sample xi. Samples and their outcome are collected sequentially and define our data D, e.g. D = {xi, f(xi), … xn, f(xn)} and is used to define the prior. The likelihood function is defined as the probability of observing the data given the function P(D | f). This likelihood function will change as more observations are collected.\n", + "
  • P(f|D) ∝ P(D|f) * P(f)
  • \n", + "The posterior represents everything we know about the objective function. It is an approximation of the objective function and can be used to estimate the cost of different candidate samples that we may want to evaluate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Surrogate Function: Bayesian approximation of the objective function that can be sampled efficiently.\n", + "The surrogate function gives us an estimate of the objective function, which can be used to direct future sampling. Sampling involves careful use of the posterior in a function known as the “acquisition” function, e.g. for acquiring more samples. We want to use our belief about the objective function to sample the area of the search space that is most likely to pay off, therefore the acquisition will optimize the conditional probability of locations in the search to generate the next sample." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Acquisition Function: Technique by which the posterior is used to select the next sample from the search space." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### So, let's implement both hyperparameter tuning methods for the dataset that is available on Kaggle, the Breast Cancer Prediction dataset\n", + "Link to the Kaggle dataset: https://www.kaggle.com/merishnasuwal/breast-cancer-prediction-dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    mean_radiusmean_texturemean_perimetermean_areamean_smoothnessdiagnosis
    017.9910.38122.801001.00.118400
    120.5717.77132.901326.00.084740
    219.6921.25130.001203.00.109600
    311.4220.3877.58386.10.142500
    420.2914.34135.101297.00.100300
    .....................
    56421.5622.39142.001479.00.111000
    56520.1328.25131.201261.00.097800
    56616.6028.08108.30858.10.084550
    56720.6029.33140.101265.00.117800
    5687.7624.5447.92181.00.052631
    \n", + "

    569 rows × 6 columns

    \n", + "
    " + ], + "text/plain": [ + " mean_radius mean_texture mean_perimeter mean_area mean_smoothness \\\n", + "0 17.99 10.38 122.80 1001.0 0.11840 \n", + "1 20.57 17.77 132.90 1326.0 0.08474 \n", + "2 19.69 21.25 130.00 1203.0 0.10960 \n", + "3 11.42 20.38 77.58 386.1 0.14250 \n", + "4 20.29 14.34 135.10 1297.0 0.10030 \n", + ".. ... ... ... ... ... \n", + "564 21.56 22.39 142.00 1479.0 0.11100 \n", + "565 20.13 28.25 131.20 1261.0 0.09780 \n", + "566 16.60 28.08 108.30 858.1 0.08455 \n", + "567 20.60 29.33 140.10 1265.0 0.11780 \n", + "568 7.76 24.54 47.92 181.0 0.05263 \n", + "\n", + " diagnosis \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "564 0 \n", + "565 0 \n", + "566 0 \n", + "567 0 \n", + "568 1 \n", + "\n", + "[569 rows x 6 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('Breast_cancer_data.csv')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dividing the value in X & Y to make prediction and spliting the dataset for training and testing" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.drop('diagnosis', axis=1)\n", + "Y = df['diagnosis']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing some sklearn metrices for calculating the accuracy, precision and recall score and form a function for to calculate all the metrices" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + 
"\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_classification(y_test, y_pred):\n", + " \n", + " acc = accuracy_score(y_test, y_pred, normalize=True)\n", + " num_acc = accuracy_score(y_test, y_pred, normalize=False)\n", + "\n", + " prec = precision_score(y_test, y_pred)\n", + " recall = recall_score(y_test, y_pred)\n", + " \n", + " print(\"Test data count: \",len(y_test))\n", + " print(\"accuracy_count : \" , num_acc)\n", + " print(\"accuracy_score : \" , acc)\n", + " print(\"precision_score : \" , prec)\n", + " print(\"recall_score : \", recall)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing the Grid search from the sklearn model selection and forming a variable parameter contining the max depth for fiting the decision tree with the best parameter suggested by the grid search" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'max_depth': 7}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parameters = {'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12]}\n", + "\n", + "grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)\n", + "grid_search.fit(x_train, y_train)\n", + "\n", + "grid_search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "decision_tree_model = DecisionTreeClassifier(max_depth = 
grid_search.best_params_['max_depth']).fit(x_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Summary of the calculation metrices achieve after predicting the values for x_test and then checking the accuracy by comparing the y_test and y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data count: 114\n", + "accuracy_count : 100\n", + "accuracy_score : 0.8771929824561403\n", + "precision_score : 0.9230769230769231\n", + "recall_score : 0.8695652173913043\n" + ] + } + ], + "source": [ + "y_pred = decision_tree_model.predict(x_test)\n", + "summarize_classification(y_test, y_pred)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Two popular libraries for Bayesian Optimization include\n", + "\n", + "
  • Scikit-Optimize
  • \n", + "
  • HyperOpt
  • \n", + "In machine learning, these libraries are often used to tune the hyperparameters of algorithms.\n", + "Hyperparameter tuning is a good fit for Bayesian Optimization because the evaluation function is computationally expensive (e.g. training models for each set of hyperparameters) and noisy (e.g. noise in training data and stochastic learning algorithms)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I have used Scikit-Optimize library to optimize the hyperparameters for this classification problem. The Scikit-Optimize project is designed to provide access to Bayesian Optimization for applications that use SciPy and NumPy, or applications that use scikit-learn machine learning algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### pip install scikit-optimize is used to install it" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### importing the important libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# example of bayesian optimization with scikit-optimize\n", + "from numpy import mean\n", + "from sklearn.model_selection import cross_val_score\n", + "from skopt.space import Integer\n", + "from skopt.utils import use_named_args\n", + "from skopt import gp_minimize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " There are many warning messages while using the gp_minimize,\n", + " such as: UserWarning: The objective has been evaluated at this point before.\n", + "\n", + "This is to be expected and is caused by the same hyperparameter configuration being evaluated more than once." 
 + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# define the model\n", + "model_tree = DecisionTreeClassifier()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Search Space \n", + "It defines the parameters that are going to be tuned, i.e. the dimensions over which we are applying the hyperparameter tuning.
    \n", + "Each search dimension can be defined either as\n", + "\n", + "
  • a (lower_bound, upper_bound) tuple (for Real or Integer dimensions),
  • \n", + "\n", + "
  • a (lower_bound, upper_bound, \"prior\") tuple (for Real dimensions),
  • \n", + "\n", + "
  • as a list of categories (for Categorical dimensions), or
  • \n", + "\n", + "
  • an instance of a Dimension object (Real, Integer or Categorical).
  • \n", + "\n", + "Also you can refer to : https://scikit-optimize.github.io/stable/modules/generated/skopt.space.space.check_dimension.html" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# define the search space of hyperparameters to search\n", + "search_space = [Integer(1, 12, name='max_depth')]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### use_named_args & set_params\n", + "We can use the use_named_args() decorator from the scikit-optimize project on the function definition that allows the function to be called directly with a specific set of parameters from the search space.\n", + "\n", + "As such, our custom function will take the hyperparameter values as arguments, which can be provided to the model directly in order to configure it. We can define these arguments generically in python using the **params argument to the function, then pass them to the model via the set_params function." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# define the function used to evaluate a given configuration\n", + "@use_named_args(search_space)\n", + "def evaluate_model(**params):\n", + " # something\n", + " model_tree.set_params(**params)\n", + " # calculate 10-fold cross validation\n", + " result = cross_val_score(model_tree, x_train, y_train, cv=10, n_jobs=-1, scoring='accuracy')\n", + " # calculate the mean of the scores\n", + " estimate = mean(result)\n", + " return 1.0 - estimate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### gp_ minimize\n", + "Bayesian optimization using Gaussian Processes.
    \n", + "If every function evaluation is expensive, for instance when the parameters are the hyperparameters of a neural network and the function evaluation is the mean cross-validation score across ten folds, optimizing the hyperparameters by standard optimization routines would take for ever!
    \n", + "The idea is to approximate the function using a Gaussian process. In other words the function values are assumed to follow a multivariate gaussian. The covariance of the function values are given by a GP kernel between the parameters. Then a smart choice to choose the next parameter to evaluate can be made by the acquisition function over the Gaussian prior which is much quicker to evaluate.\n", + "https://scikit-optimize.github.io/stable/modules/generated/skopt.gp_minimize.html#skopt.gp_minimize" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# perform optimization\n", + "result = gp_minimize(evaluate_model, search_space)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Accuracy: 1\n", + "Best Parameters: max_depth=12\n" + ] + } + ], + "source": [ + "print('Best Accuracy: %.f' % (1.0 - result.fun))\n", + "print('Best Parameters: max_depth=%d' % (result.x[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "model_tree= DecisionTreeClassifier( max_depth = result.x[0]).fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data count: 114\n", + "accuracy_count : 102\n", + "accuracy_score : 0.8947368421052632\n", + "precision_score : 0.9354838709677419\n", + "recall_score : 0.8787878787878788\n" + ] + } + ], + "source": [ + "y_pred_tree = model_tree.predict(x_test)\n", + "summarize_classification(y_test, y_pred_tree)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZEAAAEYCAYAAACdnstHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de5xdVX338c+XmWRyIYmamFEIklhSJNxURkCfiJFABGob9QXKReVRSsBKRdQWvBQt1WraWtBHFFIIIiBgUXSsSILAgBBAEuSScJEUiRlAAiQkGUIuk/yeP/Y+ycnkTOack3Obs7/v12tes886a++9Vs5kfrPW2mstRQRmZmbl2K3eBTAzs8HLQcTMzMrmIGJmZmVzEDEzs7I5iJiZWdkcRMzMrGwOIma2U5ImSgpJrfUuizUeBxEb1CSdLGmhpB5Jz0n6taSp9S5XVkn6mqSr610Oqx0HERu0JH0OuAj4V6AdeBPwfWBmPcuVz3+9W7NzELFBSdIY4ALg0xHxs4h4JSI2RcQvI+If0jxtki6S9Gz6dZGktvS9aZK6JX1e0oq0FfOJ9L3DJf1ZUkve/T4o6eH0eDdJ50n6X0kvSfqJpNel7+W6fk6T9CfgtjT945KWpfn/SdLTko4q4XqnSvqTpBclfTmvXC2SvpSeu1bSIkl7pe+9RdItklZKekLSh3fy79kl6ZuSfidptaRf5MpQIO8ekjrT6y6VdHqafgzwJeAjacvwobI+XBtUHERssHonMAy4cSd5vgwcDrwVOBg4FPhK3vtvAMYAewKnARdLem1E3Au8AhyZl/dk4Mfp8WeADwDvAfYAVgEX97n3e4D9gPdJmkLSQjoFeGPePXOKud5UYF9gOnC+pP3S9M8BJwHHAaOBTwLrJI0EbknLPD7N831J+/f7rwUfT8/fA+gFvttPvmuB7jTf8cC/SpoeETeTtAqvj4jdI+LgndzLmkVE+Mtfg+6L5BfynwfI87/AcXmv3wc8nR5PA14FWvPeXwEcnh5/HZibHo8iCSp7p68fA6bnnfdGYBPQCkwEAnhz3vvnA9fmvR4BbASOKuF6E/Le/x1wYnr8BDCzQN0/Avy2T9qlwFf7+bfqAr6V93pKWsaWvDK0AnsBm4FReXm/CfwwPf4acHW9fz78Vbsv99faYPUSME5Sa0T09pNnD2BZ3utladrWa/Q5dx2we3r8Y2CBpE8BHwIeiIjctfYGbpS0Je/czSTjMjnL+5Rj6+uIWCfppbz3i7nen/sp514kwbKvvYHDJL2cl9YKXFUgb6EyLwOGAOP65NkDWBkRa/vk7djJda2JuTvLBqt7gPUk3UD9eZbkl2nOm9K0AUXEoyS/HI9l+64sSH7ZHhsRr8n7GhYRz+RfIu/4OWBC7oWk4cDYEq/Xn+XAX/STfkefa+4eEZ/aybX2yjt+E0lr6MU+eZ4FXidpVJ+8ubJ6WfCMcRCxQSkiVpN0E10s6QOSRkgaIulYSf+WZrsW+Iqk10sal+Yv5fHTH5OMVxwB/Hde+iXANyTtDZBef2dPhN0A/LWkd0kaCvwzoF24Xr7LgH+RNFmJgySNBf4H+EtJH0v/XYZIekfeWEohH5U0RdIIkocWboiIzfkZImI5sAD4pqRhkg4iGU+6Js3yPDBRkn+3ZIQ/aBu0IuI/SQaWvwK8QPLX91nAz9MsXwcWAg8DjwAPpGnFupZk7OS2iMj/i/w7QCcwX9Ja4F7gsJ2Ucwnw98B1JK2StSTjLxvKuV4f/wn8BJgPrAEuB4an3U0zgBNJWg9/BmYDbTu51lXAD9O8w0gCaCEnkYyTPEvyYMNXI+KW9L1csH1J0gNF1sEGMUW49WlWS5J2B14GJkfEH+tdHkge8SUZEL+s3mWxwcUtEbMakPTXaZfbSOA/SFpGT9e3VGa7zkHErDZmknT/PAtMJnlE190ANui5O8vMzMrmloiZmZUtc5MNx40bFxMnTiw6/yuvvMLIkSO
rV6AGlMU6QzbrncU6Qzbrvat1XrRo0YsR8fq+6ZkLIhMnTmThwoVF5+/q6mLatGnVK1ADymKdIZv1zmKdIZv13tU6S1pWKN3dWWZmVraaBRFJx6TLUS+VdF6B99skXZ++f5+kiWn6UElXSHpE0kOSpuWd05Ve88H0a3yt6mNmZjXqzlKyL8PFwNEkS0jfL6kzXZ8o5zRgVUTsI+lEktm1HwFOB4iIA9Mg8WtJ74iI3GJ1p0RE8f1TZmZWMbVqiRwKLI2IpyJiI8nyD33XBpoJXJke3wBMlySSJalvBYiIFSQzfb1iqJlZA6hVENmT7ZeZ7mb7TXm2y5Muz72aZKXTh4CZklolTQIOYfvVRq9Iu7L+KQ06ZmZWI7V6OqvQL/e+sxz7yzOXZIe4hSRLcy8g2XUNkq6sZ9JlqX8KfAz40Q43l2YBswDa29vp6uoquuA9PT1cdOlPueWeZ1i9diPD21qQYN36zRU7HjNqKPvuPZonlq3Z4R5jRg3l6HfuyVv3HTtwYSukp6enpH+jZpHFemexzpDNelerzrUKIt1s33qYwI77OuTydEtqJdlCdGW6NMQ5uUySFgBPAuT2W4iItZJ+TNJttkMQiYg5wByAjo6OKOUxt4su/Sm/vGM5GzYkcevVDdtWxq7U8eq1G/nd4hf7fe+Xdyxnyn77MeOIKUWXe1dk8fFHyGa9s1hnyGa9q1XnWnVn3Q9MljQp3U/hRJKlr/N1Aqemx8eTLL8deYvWIelooDciHk27t8al6UOA9wOLK13wW+55ZmsAqZcNG3q59Jq76loGM7NCatISiYheSWcB80j2bJ4bEUskXQAsjIhOkn0QrpK0FFhJEmgAxgPz0q1DnyHpsoJkX4R5aQBpAX4D/Fely7567cZKX7IsK15aU+8imJntoGYz1iPiJuCmPmnn5x2vB04ocN7TwL4F0l8hGWSvqjGjhjZEIBk/dnS9i2BmtgPPWB/A0e/ck7a2+q4O09bWyhmnTK1rGczMCnEQGcBb9x3LuWfOoH3caCQYvfswxowaVtHj9nGj+eD7Dt56jxHDh2y9f/u40Zx75oyaDaqbmZUicwswlmPGEVNq8kv88+n3xX94ljO/+GP2m/wG/utbH636fc3MyuWWSAMaMSxpiaxfv6nOJTEz2zkHkQY0fNhQAF51EDGzBucg0oCGpy0RBxEza3QOIg1oW0uk/o8Wm5ntjINIAxo6pIXddhMbN22mt3fzwCeYmdWJg0gDkrStS2uDu7TMrHE5iDQoD66b2WDgINKghvsxXzMbBBxEGlSuJbLOg+tm1sAcRBrUCD/ma2aDgINIg9o2V8QtETNrXA4iDWpYm1siZtb4HEQa1PDh6dNZrzqImFnjchBpULkxEQ+sm1kjcxBpUJ4nYmaDgYNIg9o6T8Qz1s2sgTmINCgvwmhmg4GDSIMavnVMxC0RM2tcNQsiko6R9ISkpZLOK/B+m6Tr0/fvkzQxTR8q6QpJj0h6SNK0Aud2Slpc9UrU0NZ5Iq+6JWJmjasmQURSC3AxcCwwBThJUt9Ny08DVkXEPsCFwOw0/XSAiDgQOBr4tqSt5Zb0IaCnujWoPW9MZWaDQa1aIocCSyPiqYjYCFwHzOyTZyZwZXp8AzBdkkiCzq0AEbECeBnoAJC0O/A54OtVr0GN+eksMxsMahVE9gSW573uTtMK5omIXmA1MBZ4CJgpqVXSJOAQYK/0nH8Bvg2sq17R68PLnpjZYNBao/uoQFoUmWcusB+wEFgGLAB6Jb0V2CcizsmNn/R7c2kWMAugvb2drq6uogve09NTUv5KeWHVegBWrlpd8/vXq871lsV6Z7HOkM16V6vOtQoi3WxrPQBMAJ7tJ0+3pFZgDLAyIgI4J5dJ0gLgSeA9wCGSniapx3hJXRExre/NI2IOMAego6Mjpk3bIUu/urq6KCV/pbzw0lq+c/Vi2G1Ize9frzrXWxbrncU6QzbrXa0616o7635gsqRJkoYCJwKdffJ0Aqe
mx8cDt0VESBohaSSApKOB3oh4NCJ+EBF7RMREYCrwh0IBZLDyPBEzGwxq0hKJiF5JZwHzgBZgbkQskXQBsDAiOoHLgaskLQVWkgQagPHAPElbgGeAj9WizPU2LO/prIggecbAzKyx1Ko7i4i4CbipT9r5ecfrgRMKnPc0sO8A134aOKAS5WwUrS27MXRICxs3bWbjxl7a0qXhzcwaiWesN7BtW+T6MV8za0wOIg3Mj/maWaNzEGlgXsnXzBqdg0gDc3eWmTU6B5EGtm0RRgcRM2tMDiINbITniphZg3MQaWDDvJKvmTU4B5EG5qezzKzROYg0MO9uaGaNzkGkgeXGRPyIr5k1KgeRBuYtcs2s0TmINDDvbmhmjc5BpIF5n3Uza3QOIg1s28C6u7PMrDE5iDQwd2eZWaNzEGlgnidiZo3OQaSBuSViZo3OQaSBeWDdzBqdg0gDGzHc3Vlm1tgcRBqYu7PMrNE5iDSwtqGtSLBhYy+bN2+pd3HMzHZQdBCRdIKkUenxVyT9TNLbSzj/GElPSFoq6bwC77dJuj59/z5JE9P0oZKukPSIpIckTcs75+Y0bYmkSyS1FFuewUASw9q8Ra6ZNa5SWiL/FBFrJU0F3gdcCfygmBPTX+4XA8cCU4CTJE3pk+00YFVE7ANcCMxO008HiIgDgaOBb0vKlfvDEXEwcADweuCEEuozKIxwl5aZNbBSgsjm9PtfAT+IiF8AQ4s891BgaUQ8FREbgeuAmX3yzCQJTAA3ANMliSTo3AoQESuAl4GO9PWaNH9rWpYooT6Dgmetm1kjay0h7zOS5gBHAbMltVF8ENoTWJ73uhs4rL88EdEraTUwFngImCnpOmAv4JD0++8AJM0jCVK/Jgk+O5A0C5gF0N7eTldXV5HFhp6enpLyV9rm3g0A3HX3vezx+hE1uWe961wvWax3FusM2ax3tepcShA5ATgG+LeIeFnSG4AvFHmuCqT1bTX0l2cusB+wEFgGLAB6t2aIeJ+kYcA1wJHALTtcJGIOMAego6Mjpk2bVmSxoauri1LyV9pPfvMcz734DPvvfxAHT5lQk3vWu871ksV6Z7HOkM16V6vOAwYRSWvZ9gtfQCS9TMkxMLqI+3STtB5yJgDP9pOnW1IrMAZYGREBnJNXngXAk/knRsR6SZ0kXWI7BJHBzN1ZZtbIBuyOiohRETE6/drhuMj73A9MljRJ0lDgRKCzT55O4NT0+HjgtogISSMkjQSQdDTQGxGPStpd0hvT9FbgOODxIsszaAxv86x1M2tcpXRnlS0d4zgLmAe0AHMjYomkC4CFEdEJXA5cJWkpsJIk0ACMB+ZJ2gI8A3wsTR8JdKZjMy3AbcAltahPLW2bcOiWiJk1nlK6swqOWRTbGomIm4Cb+qSdn3e8ngKP6EbE08C+BdKfB95RzL0HM6+fZWaNbMAgEhGjalEQK8xLn5hZIyupO0vSa4HJwLBcWkTcWelC2TbDh7slYmaNq+ggIulvgbNJnqx6EDgcuIfksVqrkuXPrATgRz+9l5/PexAJ1vSsZ9TIYQWPx48dzRmnTGXGEX0XBDAzq7xSZqyfTTIGsSwi3gu8DXihKqUyAObf+Si3Lnhi6+s1PetZvXY9Ef0fP//iGmZfMp/5dz5ax5KbWVaUEkTWp4PfSGqLiMcpMOBtlXPpNXfR21v66r0bNvRy6TV3VaFEZmbbK2VMpFvSa4CfA7dIWsWOEwatgla8tGbgTFU418ysWEUHkYj4YHr4NUm3k8wov7kqpTIAxo8dzfMvlhcMxo8tdh6omVn5ytqUKiLuiIjOdEVeq5IzTplKW1vp80Hb2lo545SpVSiRmdn2StmU6sq0Oyv3+rWS5lanWAYw44gpnHvmDNrHjUaC0bsPY8yoYQWPc8aPHcW5Z87w01lmVhOl/Jl7UES8nHsREaskva0KZbI8M46YUlRA+NCsS1nx0lp+8K8n0T7OXVlmVhuldGftlk42BEDS66jR2ls2sK2
r/b7qHkYzq51SgsC3gQWSbiBZS+vDwDeqUiormdfYMrN6KOXprB9JWkgyQ13AhyLCM9oahNfYMrN6KKk7Kg0aDhwNaFtLxN1ZZlY7ZT3ia40n1xJZ55aImdWQg0iTGJFb7dcD62ZWQ6Ws4nskcArwMrAYeBhYHBEbqlQ2K4HHRMysHkoZE7ka+HR6zkHAB4D9gX2qUC4r0QiPiZhZHZQSRJZGxI3p8X9XozBWvuHD0zERd2eZWQ2VMiZyh6RzJBXaa93qzPNEzKweSmmJ7A8cAJwraRHJ7oYPRoRbJQ3AYyJmVg9Ft0Qi4kMR8ZfAJOCrwJPAYcWeL+kYSU9IWirpvALvt0m6Pn3/PkkT0/Shkq6Q9IikhyRNS9NHSPqVpMclLZH0rWLL0oxyYyLrPCZiZjVU8tpXEfEqsDD9KoqkFuBi4GigG7hfUmefGe+nAasiYh9JJwKzgY8Ap6f3PVDSeODXkt6RnvMfEXG7pKHArZKOjYhfl1qnZuCWiJnVQ63miRxKMjD/VLoHyXXAzD55ZgJXpsc3ANPT8ZcpwK0AEbGC5BHjjohYFxG3p+kbgQeACVWvSYPyjHUzq4darcK7J7A873U3O3aFbc0TEb2SVgNjgYeAmZKuA/YCDkm//y53YrrPyV8D3yl0c0mzgFkA7e3tdHV1FV3wnp6ekvLXy59fXAfAiy+9vMvlHSx1rrQs1juLdYZs1rtadS4qiKQtggkRsXzAzP1cokBaFJlnLrAfSffZMmAB0JtXtlbgWuC7EfFUoZtHxBxgDkBHR0dMmzat6IJ3dXVRSv56efb5l/netY+yW8uQXS7vYKlzpWWx3lmsM2Sz3tWqc1FBJCJC0s9JWgHl6CZpPeRMAJ7tJ093GhjGACsjIoBzcpkkLSAZ1M+ZAzwZEReVWbamsHXtrFc9JmJmtVPKmMi9eQPapbofmCxpUjoIfiLQ2SdPJ3Bqenw8cFsavEZIGgkg6WigNzcgL+nrJMHms2WWq2l4xrqZ1UMpYyLvBc6U9DTwCkn3U0TEQQOdmI5xnAXMA1qAuRGxRNIFwMKI6AQuB66StBRYSRJoAMYD8yRtAZ4BPgYgaQLwZeBx4IF0DuT3IuKyEurUNIYObWW33cTGTZvp7d1Ma2tLvYtkZhlQShA5dlduFBE3ATf1STs/73g9cEKB854G9i2Q3k3hcZRMksTwYUN4Zd1GXt2wiVEOImZWA6V0Z/0JeDdwakQsIxn0bq9KqawsnitiZrVWShD5PvBO4KT09VqSCYTWILbOFfEijGZWI6V0Zx0WEW+X9HuAiFiVDpJbgxgx3C0RM6utUloim9LlSwJA0uuBLVUplZUl1xLxcvBmViulBJHvAjcC4yV9A7gL+GZVSmVlGeF91s2sxoruzoqIa9Il4KeTPBX1gYh4rGols5J5/Swzq7VS9lifHRHnkszL6JtmDcBPZ5lZrZXSnXV0gbRdmjtileWWiJnV2oAtEUmfAv4OeLOkh/PeGgXcXa2CWencEjGzWiumO+s44P3AEyTLreesjYiVVSmVlWW4dzc0sxorJoj8Rfr9CWANeUuNSHqdA0nj8DwRM6u1YoLIJcDNJHurL2L79aoCeHMVymVlGOF5ImZWYwMOrEfEdyNiP+CKiHhzREzK+3IAaSAeEzGzWitlnsinJL0WmAwMy0u/sxoFs9L56Swzq7VS5on8LXA2ya6EDwKHA/cAR1anaFaq4R4TMbMaK2WeyNnAO4BlEfFe4G3AC1UplZXFq/iaWa2VEkTWpxtHIaktIh6nwGZRVj9eO8vMaq2UpeC7Jb0G+Dlwi6RVwLPVKZaVw2MiZlZrpQysfzA9/Jqk24ExJI/+WoPwPBEzq7VSWiJbRcQdlS6I7bq2oa1IsGFjL72bt9DaUkpvpZlZ6fxbpolI2jpXZL1bI2ZWAzULIpKOkfSEpKWSzivwfpuk69P375M0MU0fKukKSY9IekjStLxzviFpuaSeWtWj0XlcxMxqqeQgImlkuk1
uKee0ABeTLB0/BThJ0pQ+2U4DVkXEPsCFwOw0/XSAiDiQZDn6b0vKlfuXwKGl1qGZeda6mdXSgEFE0m6STpb0K0krSDalek7SEkn/LmlyEfc5FFgaEU9FxEbgOmBmnzwzgSvT4xuA6ZJEEnRuBYiIFcDLQEf6+t6IeK6I+2fGCK/ka2Y1VMzA+u3Ab4AvAosjYgskK/gC7wW+JenGiLh6J9fYE1ie97obOKy/PBHRK2k1MBZ4CJgp6TpgL+CQ9Pvviig7aVlnAbMA2tvb6erqKvZUenp6Sspfbxs2rAPgnnvv58/LR5V1jcFW50rJYr2zWGfIZr2rVedigshREbFD30i6BPxPgZ9KGjLANVQgLYrMMxfYD1gILAMWAL0DFbpPWecAcwA6Ojpi2rRpRZ/b1dVFKfnr7Vd3v8SyZ3vY9y1TeNchfzHwCQUMtjpXShbrncU6QzbrXa06F7OK7yYASRel3Uv95tmJbpLWQ84EdpyouDWPpFaSeSgrI6I3Is6JiLdGxEzgNcCTA5U7qzwmYma1VMrAeg/QKWkkgKQZkordHvd+YLKkSZKGAicCnX3ydAKnpsfHA7dFREgakXfPo4HeiHi0hHJnynDvKWJmNVTKjPWvSDoZ6JK0AXgF2OFR3X7O7ZV0FjAPaAHmRsQSSRcACyOiE7gcuErSUmAlSaABGA/Mk7QFeAb4WO66kv4NOBkYIakbuCwivlZsnZqRWyJmVkulLAU/neRx21eANwKnRcQTxZ4fETcBN/VJOz/veD1wQoHznqafhR4j4h+Bfyy2DFngfdbNrJZK6c76MvBPETGNpLvpekneS6TBbF0/61W3RMys+krpzjoy7/gRSceSPJ31rmoUzMrjGetmVkvFTDbs74ms54DpO8tjtbctiLglYmbVV0x31u2S/l7Sm/IT06es3inpSrY9VWV1tm05eLdEzKz6iunOOgb4JHCtpEkky44MI3nKaj5wYUQ8WL0iWilyT2et85iImdVAMUFkdkScLemHwCZgHPBqRLxc1ZJZWTwmYma1VEx31vT0+28jYlNEPOcA0ri8z7qZ1VIxLZGbJd0DvEHSJ0kWRFySzuuwBrPokWUA/O+yFzju1O8hwZqe9YwaOazo49Vr1zP6isUF84wfO5ozTpnKjCP6ruRvZlk0YBCJiC9IejPQBUwC/gbYX9JGklV9P1LdIlqx5t/5KP917baVaNb0rK/48fMvruEb3/s135l7W8nBqRbH48eO5l2HTGLBoj+y4qU1FQuejVbWcu/nPwCs0oqaJxIRT0k6KiL+kEuTtDtwQNVKZiW79Jq72LCxpAWOy7J5c7B6bRJYqhGodjXI3TjvobqXoxHL+vyLa5h9yXwABxKrmKInGwLL0rWzJvY5796KlsjKtuKlNfUugjW4DRt6ufSauxxErGJKWfbkFyS7D/aSrJ+V+7IGMX7s6HoXwQYB/7FhlVRKS2RCRBxTtZLYLjvjlKnMvmQ+GzZUv0vLBi//sWGVVEpLZIGkA6tWEttlM46YwrlnzqB93GgkGL37MMaMGlbyMew8z5DWUn5srJG0tbVyxilT610MayKltESmAv9X0h+BDSTb2UZEHFSVkllZZhwxZZf7uwfaRnP+nY9y6TV31eSJopo/nbX74ChrKcfr1m+kt3cLY0YP5+xPvNfjIVZRpQSRY6tWChtUKhGoqu3zZZxTr323yylrKf7luzcx745HOevj72n4z80Gn1KWgl9WzYKYWXVs26jMqxhY5RWzFPxd6fe1ktak33NffszDrMGNGOaVna16ipmxPjX9Pqr6xTGzSvMeM1ZNpeyx3gF8iT6TDT2wbtbYtu0x4yBilVfKwPo1wD8AjwBbqlMcM6u0bXvMuDvLKq+UB/5fiIjOiPhjRCzLfRV7sqRjJD0haamk8wq83ybp+vT9+yRNTNOHSrpC0iOSHpI0Le+cQ9L0pZK+6216zXbkPWasmkoJIl+VdJmkkyR9KPdVzImSWoCLSR4TngK
cJKnvs4anAasiYh/gQmB2mn46QEQcCBwNfFtSrtw/AGYBk9Mvz6g362P4MHdnWfWU0p31CeAtwBC2dWcF8LMizj0UWBoRTwFIuo5kHa5H8/LMBL6WHt8AfC9tWUwBbgWIiBWSXgY6JC0HRkfEPek1fwR8APh1CXUya3ojtj7i65aIVV4pQeTgtDVQjj2B5Xmvu4HD+ssTEb2SVgNjSTbBmpkGnr2AQ9LvW9Lr5F9zz0I3lzSLpMVCe3s7XV1dRRe8p6enpPzNIIt1huatd/fzyTqpK15YuUP9mrXOA8livatV51KCyL2SpkTEowNn3UGhsYooMs9cYD9gIbAMWECyknAx10wSI+YAcwA6OjqilFnJ9ZrFXE9ZrDM0b72f7n6JS37yGK2tw3aoX7PWeSBZrHe16lzq2lmnlrl2VjdJ6yFnAvBsP3m6JbUCY4CVERHAOblMkhYATwKr0uvs7JpmmeeBdaumUoLIrgxa3w9MljQJeAY4ETi5T55O4FTgHuB44LaICEkjAEXEK5KOBnpzraF01vzhwH3Ax4H/twtlNGtKnidi1VSTtbPSMY6zgHlACzA3IpZIugBYGBGdwOXAVZKWAitJAg3AeGCepC0kAehjeZf+FPBDYDjJgLoH1c36GN62bWA9IvCT8FZJpbREdklE3ATc1Cft/Lzj9cAJBc57Gti3n2suxPu8m+1Ua2sLQ4e0sHHTZjZu7KUtDSpmleDdhcwyYOusdXdpWYU5iJhlgAfXrVocRMwywCv5WrU4iJhlgJc+sWpxEDHLAHdnWbU4iJhlQG6uiAfWrdIcRMwyYIT3FLEqcRAxywB3Z1m1OIiYZYCfzrJqcRAxy4DhufWz3J1lFeYgYpYBbolYtTiImGWAlz2xanEQMcuAbcvBuzvLKstBxCwDRrg7y6rEQcQsA4Z7nohViYOIWQZ4nohVi4OIWQb46SyrFgcRswzwKr5WLQ4iZhmQG1hf5+4sqzAHEbMMyG+JRESdS2PNxEHELAOGDGlhSGsLmzdvYVPv5noXx5pIzYKIpGMkPSFpqaTzCrzfJun69P37JE1M04dIulLSI5Iek/TFvHPOlrRY0hJJn61VXcwGo9zguh/ztUqqSRCR1JMJn/MAAAzlSURBVAJcDBwLTAFOkjSlT7bTgFURsQ9wITA7TT8BaIuIA4FDgDMkTZR0AHA6cChwMPB+SZOrXxuzwclPaFk11KolciiwNCKeioiNwHXAzD55ZgJXpsc3ANMlCQhgpKRWYDiwEVgD7AfcGxHrIqIXuAP4YPWrYjY4bRsXcUvEKqe1RvfZE1ie97obOKy/PBHRK2k1MJYkoMwEngNGAOdExEpJi4FvSBoLvAocBywsdHNJs4BZAO3t7XR1dRVd8J6enpLyN4Ms1hmav969m9YDcPeC+/jTU7sDzV/n/mSx3tWqc62CiAqk9X1EpL88hwKbgT2A1wK/lfSbiHhM0mzgFqAHeAjoLXTziJgDzAHo6OiIadOmFV3wrq4uSsnfDLJYZ2j+et/YtYLu51/hLfsdyDsO3hto/jr3J4v1rlada9Wd1Q3slfd6AvBsf3nSrqsxwErgZODmiNgUESuAu4EOgIi4PCLeHhFHpHmfrGotzAYxL31i1VCrIHI/MFnSJElDgROBzj55OoFT0+PjgdsieaD9T8CRSowEDgceB5A0Pv3+JuBDwLVVr4nZIOVZ61YNNenOSsc4zgLmAS3A3IhYIukCYGFEdAKXA1dJWkrSqjgxPf1i4ApgMUmX1xUR8XD63k/TMZFNwKcjYlUt6mM2GHlPEauGWo2JEBE3ATf1STs/73g9yeO8fc/rKZSevvfuChfTrGl5nohVg2esm2WE54lYNTiImGXEtn3W3RKxynEQMcsIb5Fr1eAgYpYRfjrLqsFBxCwjPE/EqsFBxCwjtj3i65aIVY6DiFlGeGDdqsFBxCwjtnZneZ6IVZCDiFlGbNtn3d1ZVjkOImYZMdxjIlYFDiJmGeGns6waHET
MMmJIawstLbvR27uFTZs217s41iQcRMwyQpJbI1ZxNVvF18zqa/6dj25dwffDn76Mlt3E6rXrGX3FYiRY07OeUSOHbT0eP3Y07zpkEgsW/ZEVL63Z7r1GOy61rDurdyOUrxrHq9eup/3aP3DGKVOZccSUiv1cKdn3KTs6Ojpi4cKCW7EX5G00s6OZ6z3/zkeZfcl8NmwouIO0ZUhbWyvnnjmj5EAiaVFEdPRNd3eWWQZces1dDiAGwIYNvVx6zV0Vu56DiFkGrHhpTb2LYA2kkj8PDiJmGTB+7Oh6F8EaSCV/HhxEzDLgjFOm0tbm52gsGRM545SpFbueg4hZBsw4YgrnnjmD9nGjkWD07sMYM2oYsO04P12C9nGj+eD7Dt7hnEY8LrWsO6t3I5SvGseQlKOcQfWd8Z8mZhkx44gpO/zyKOaJtM9XsUyVVmxZ6/UkXj3/LatV55q1RCQdI+kJSUslnVfg/TZJ16fv3ydpYpo+RNKVkh6R9JikL+adc46kJZIWS7pW0rBa1cfMzGoURCS1ABcDxwJTgJMk9W1PnQasioh9gAuB2Wn6CUBbRBwIHAKcIWmipD2BzwAdEXEA0AKcWP3amJlZTq1aIocCSyPiqYjYCFwHzOyTZyZwZXp8AzBdkoAARkpqBYYDG4Hc82mtwPD0vRHAs9WthpmZ5avVmMiewPK8193AYf3liYheSauBsSQBZSbwHEmgOCciVgJI+g/gT8CrwPyImF/o5pJmAbMA2tvb6erqKrrgPT09JeVvBlmsM2Sz3lmsM2Sz3tWqc62CiAqk9V1vpb88hwKbgT2A1wK/lfQbYBVJcJkEvAz8t6SPRsTVO1wkYg4wB5JlT0oZXGrmpTD6k8U6QzbrncU6QzbrXa061yqIdAN75b2ewI5dT7k83Wn31BhgJXAycHNEbAJWSLob6CAJMH+MiBcAJP0MeBewQxDJt2jRohclLSuh7OOAF0vI3wyyWGfIZr2zWGfIZr13tc57F0qsVRC5H5gsaRLwDMkA+Ml98nQCpwL3AMcDt0VESPoTcKSkq0m6sw4HLiIZHzlc0giS7qzpwIArK0bE60spuKSFhRYda2ZZrDNks95ZrDNks97VqnNNgkg6xnEWMI/kKaq5EbFE0gXAwojoBC4HrpK0lKQFknvS6mLgCmAxSZfXFRHxMICkG4AHgF7g96RdVmZmVhuZWwq+VP6LJTuyWO8s1hmyWe9q1dnLngwsi62bLNYZslnvLNYZslnvqtTZLREzMyubWyJmZlY2BxEzMyubg0g/BlowsllI2kvS7enilksknZ2mv07SLZKeTL+/tt5lrTRJLZJ+L+l/0teT0sU/n0wXAx1a7zJWmqTXSLpB0uPpZ/7OZv+sCy3U2oyftaS5klZIWpyXVvCzVeK76e+3hyW9vdz7OogUUOSCkc2iF/h8ROxHMgfn02ldzwNujYjJwK3p62ZzNvBY3uvZwIVpnVeRLArabL5DMnn3LcDBJPVv2s96Jwu1NuNn/UPgmD5p/X22xwKT069ZwA/KvamDSGHFLBjZFCLiuYh4ID1eS/JLZU+2XxDzSuAD9SlhdUiaAPwVcFn6WsCRJGu1QXPWeTRwBMmcLCJiY0S8TJN/1uy4UOtzNOFnHRF3ksyxy9ffZzsT+FEk7gVeI+mN5dzXQaSwQgtG7lmnstRMuofL24D7gPaIeA6SQAOMr1/JquIi4B+BLenrscDLEdGbvm7Gz/zNwAvAFWk33mWSRtLEn3VEPAPkFmp9DlgNLKL5P+uc/j7biv2OcxAprJgFI5uKpN2BnwKfjYg1A+UfzCS9H1gREYvykwtkbbbPvBV4O/CDiHgb8ApN1HVVSDoGkFuodQ9gJElXTl/N9lkPpGI/7w4ihRWzYGTTkDSEJIBcExE/S5OfzzVv0+8r6lW+Kvg/wN9Iepqkq/JIkpbJa9IuD2jOz7wb6I6I+9LXN5AElWb+rI8iXag1XcQ1t1Brs3/WOf19thX7HecgUtj
WBSPTpzZOJFkgsumkYwGXA49FxH/mvZVbEJP0+y9qXbZqiYgvRsSEiJhI8tneFhGnALeTLP4JTVZngIj4M7Bc0r5p0nTgUZr4sybpxjpc0oj0Zz1X56b+rPP099l2Ah9Pn9I6HFid6/YqlWes90PScSR/neYWjPxGnYtUFZKmAr8FHmHb+MCXSMZFfgK8ieQ/4gm5zcCaiaRpwBci4v2S3kzSMnkdyYKeH42IDfUsX6VJeivJwwRDgaeAT5D8Mdm0n7WkfwY+wraFWv+WpP+/qT5rSdcC00iWfH8e+Crwcwp8tmlA/R7J01zrgE9ExICroBe8r4OImZmVy91ZZmZWNgcRMzMrm4OImZmVzUHEzMzK5iBiZmZlcxAxM7OyOYiYmVnZHESsqUkKSd/Oe/0FSV+rwHUn5u/bUE2SPpPu/XHNLl6np9Cx2a5wELFmtwH4kKRx9S5IvnS5iWL///0dcFy6NItZQ3EQsWbXC8wBzslP7NuSyLVQ0vTH02XSF0u6RtJRku5Od4c7NO8yrZKuTHeGu0HSiPRaH5X0O0kPSro03eQsd8/HJH0feIDtF8BD0ufSey6W9Nk07RKSJdw7JW1Xh/T9j6f3f0jSVWnazyUtUrKb36yd/eNIGinpV+n5iyV9pECeGyV9XdJvJf1Z0lE7u6Zli4OIZcHFwCmSxhSZfx+SHQAPAt4CnAxMBb5Asq5Yzr7AnIg4CFgD/J2k/UjWafo/EfFWYDNwSp9zfhQRb4uIZblESYeQrGN1GMkOk6dLeltEnEmyuup7I+LC/EJK2h/4MnBkRBxMslMjwCcj4hCgA/iMpLE7qesxwLMRcXC689/NBfIcQLL/xrtJWkVuEdlWDiLW9NL9UX5Esk1qMf4YEY9ExBZgCcn2okGySOXEvHzLI+Lu9PhqkkAzHTgEuF/Sg+nrN+edsyzdSa6vqcCNEfFKRPSQLFn+7gHKeSRwQ0S8mNYzt2jiZyQ9BNxL0tqZvJNrPAIcJWm2pHdHxOr8N9PW1RggF8BagZcHKJdlSOvAWcyawkUkXUhXpK972f6PqGF5x/mruW7Je72F7f/P9F29NEg2+7kyIr7YTzle6Se90CZBA1HfMqSrEh8FvDMi1knqYvu6bSci/pC2go4DvilpfkRckJdlf2BRRGxOXx8E1OSBAhsc3BKxTEj/Sv8JcFqa9DwwXtJYSW3A+8u47JskvTM9Pgm4C7gVOF7SeABJr5O0dxHXuhP4QLrvxUjggyRL9O/MrcCHc91Vkl5H0mpYlQaQt5B0jfVL0h7Auoi4mmQb2bf3yXIA8GDe64OAh4uoj2WEWyKWJd8GzgKIiE2SLiDZN+WPwONlXO8x4FRJlwJPkmw7u07SV4D56dNXm4BPA8t2ch0i4gFJPwR+lyZdFhG/H+CcJZK+AdwhaTPJvhhnAGdKehh4gqRLa2cOBP5d0pa0rJ8q8P59ea8PwC0Ry+P9RMzMrGzuzjIzs7I5iJiZWdkcRMzMrGwOImZmVjYHETMzK5uDiJmZlc1BxMzMyvb/Ab/rsy9GNWffAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from skopt.plots import plot_convergence\n", + "plot_convergence(result);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Now using the KNeighborsClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Accuracy: 1\n", + "Best Parameters: n_neighbors=3, p=1\n" + ] + } + ], + "source": [ + "# define the model\n", + "model_kn =KNeighborsClassifier()\n", + "\n", + "\n", + "# define the search space of hyperparameters to search\n", + "search_space = [Integer(1, 12, name='n_neighbors'), Integer(1, 3, name='p')]\n", + "\n", + "# define the function used to evaluate a given configuration\n", + "@use_named_args(search_space)\n", + "def evaluate_model(**params):\n", + " # something\n", + " model_kn.set_params(**params)\n", + " # calculate 10-fold cross validation\n", + " result = cross_val_score(model_kn, x_train, y_train, cv=10, n_jobs=-1, scoring='accuracy')\n", + " # calculate the mean of the scores\n", + " estimate = mean(result)\n", + " return 1.0 - estimate\n", + "\n", + "# perform optimization\n", + "result = gp_minimize(evaluate_model, search_space)\n", + "# summarizing finding:\n", + "print('Best Accuracy: %.f' % (1.0 - result.fun))\n", + "print('Best Parameters: n_neighbors=%d, p=%d' % (result.x[0], result.x[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "model_kn = KNeighborsClassifier( n_neighbors = result.x[0],p=result.x[1]).fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data count: 114\n", + "accuracy_count : 98\n", + "accuracy_score : 0.8596491228070176\n", + "precision_score : 0.8571428571428571\n", + "recall_score : 0.9090909090909091\n" + ] + } + ], + "source": [ + "y_pred = model_kn.predict(x_test)\n", + "summarize_classification(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from skopt.plots import plot_convergence\n", + "plot_convergence(result);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import tree, model_selection" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'fit_time': array([0.00612926, 0.00793409, 0.00648522, 0.00653386, 0.00688672,\n", + " 0.00996494, 0.00953984, 0.01045251, 0.00822282, 0.0082016 ]),\n", + " 'score_time': array([0.0010345 , 0.00200129, 0.00301623, 0.0030067 , 0.00308418,\n", + " 0.004915 , 0.00572562, 0.00488901, 0.00896358, 0.00399947]),\n", + " 'test_score': array([0.91304348, 0.95652174, 0.82608696, 0.80434783, 0.91304348,\n", + " 0.95555556, 0.91111111, 0.86666667, 0.86666667, 0.97777778])}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifer = DecisionTreeClassifier()\n", + "classifer.fit(x_train,y_train)\n", + "results = model_selection.cross_validate(classifer, x_train, y_train, cv = 10)\n", + "\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Mean Score : 0.8990821256038647\n" + ] + } + ], + "source": [ + "print(\"Test Mean Score :\",results.get('test_score').mean())\n", + "print(\"Train Mean Score :\",results.get('train_score').mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 5da7a1db7f6d4a429f64175eda14c2168599ca52 Mon Sep 17 00:00:00 2001 From: Kalash Jindal <37014842+erickeagle@users.noreply.github.com> Date: Thu, 1 Oct 2020 15:16:01 +0530 Subject: [PATCH 2/2] Add files via upload --- .../01-ExploringTheTitanicDataset.ipynb | 1788 +++++++++++++++++ ...ion_LogisticRegression_Titanic-Copy1.ipynb | 834 ++++++++ ...ification_LogisticRegression_Titanic.ipynb | 679 +++++++ ...MultipleClassificationModels_Titanic.ipynb | 1171 +++++++++++ ...4-HyperparameterTuningWithGridSearch.ipynb | 485 +++++ 5 files changed, 4957 insertions(+) create mode 100644 Machine Learning/01-ExploringTheTitanicDataset.ipynb create mode 100644 Machine Learning/02-BinaryClassification_LogisticRegression_Titanic-Copy1.ipynb create mode 100644 Machine Learning/02-BinaryClassification_LogisticRegression_Titanic.ipynb create mode 100644 Machine Learning/03-MultipleClassificationModels_Titanic.ipynb create mode 100644 Machine Learning/04-HyperparameterTuningWithGridSearch.ipynb diff --git a/Machine Learning/01-ExploringTheTitanicDataset.ipynb b/Machine Learning/01-ExploringTheTitanicDataset.ipynb new file mode 100644 index 00000000..0d3fff52 --- /dev/null +++ b/Machine Learning/01-ExploringTheTitanicDataset.ipynb @@ -0,0 +1,1788 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already up-to-date: scikit-learn in /anaconda3/lib/python3.7/site-packages (0.20.2)\r\n", + 
"Requirement already satisfied, skipping upgrade: numpy>=1.8.2 in /anaconda3/lib/python3.7/site-packages (from scikit-learn) (1.16.1)\r\n", + "Requirement already satisfied, skipping upgrade: scipy>=0.13.3 in /anaconda3/lib/python3.7/site-packages (from scikit-learn) (1.2.1)\r\n" + ] + } + ], + "source": [ + "!pip install -U scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.20.2\n" + ] + } + ], + "source": [ + "print(sklearn.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.16.1\n" + ] + } + ], + "source": [ + "print(np.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.23.4\n" + ] + } + ], + "source": [ + "print(pd.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Titanic dataset\n", + "\n", + "Source: https://www.kaggle.com/francksylla/titanic-machine-learning-from-disaster" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
    0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
    1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
    2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
    3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
    4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
    5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
    6701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S
    7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
    8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NaNS
    91012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NaNC
    \n", + "
    " + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "8 9 1 3 \n", + "9 10 1 2 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54.0 0 \n", + "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", + "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", + "9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S \n", + "8 2 347742 11.1333 NaN S \n", + "9 0 237736 30.0708 NaN C " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.read_csv('titanic_train.csv')\n", + "\n", + "titanic_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(891, 12)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked
    003male22.0107.2500S
    111female38.01071.2833C
    213female26.0007.9250S
    311female35.01053.1000S
    403male35.0008.0500S
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", + "0 0 3 male 22.0 1 0 7.2500 S\n", + "1 1 1 female 38.0 1 0 71.2833 C\n", + "2 1 3 female 26.0 0 0 7.9250 S\n", + "3 1 1 female 35.0 1 0 53.1000 S\n", + "4 0 3 male 35.0 0 0 8.0500 S" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], 'columns', inplace=True)\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Survived 179\n", + "Pclass 179\n", + "Sex 179\n", + "Age 2\n", + "SibSp 179\n", + "Parch 179\n", + "Fare 179\n", + "Embarked 177\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df[titanic_df.isnull().any(axis=1)].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "titanic_df = titanic_df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(712, 8)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Survived 0\n", + "Pclass 0\n", + "Sex 0\n", + "Age 0\n", + "SibSp 0\n", + "Parch 0\n", + "Fare 0\n", + "Embarked 0\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df[titanic_df.isnull().any(axis=1)].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassAgeSibSpParchFare
    count712.000000712.000000712.000000712.000000712.000000712.000000
    mean0.4044942.24016929.6420930.5140450.43258434.567251
    std0.4911390.83685414.4929330.9306920.85418152.938648
    min0.0000001.0000000.4200000.0000000.0000000.000000
    25%0.0000001.00000020.0000000.0000000.0000008.050000
    50%0.0000002.00000028.0000000.0000000.00000015.645850
    75%1.0000003.00000038.0000001.0000001.00000033.000000
    max1.0000003.00000080.0000005.0000006.000000512.329200
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Age SibSp Parch Fare\n", + "count 712.000000 712.000000 712.000000 712.000000 712.000000 712.000000\n", + "mean 0.404494 2.240169 29.642093 0.514045 0.432584 34.567251\n", + "std 0.491139 0.836854 14.492933 0.930692 0.854181 52.938648\n", + "min 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000\n", + "25% 0.000000 1.000000 20.000000 0.000000 0.000000 8.050000\n", + "50% 0.000000 2.000000 28.000000 0.000000 0.000000 15.645850\n", + "75% 1.000000 3.000000 38.000000 1.000000 1.000000 33.000000\n", + "max 1.000000 3.000000 80.000000 5.000000 6.000000 512.329200" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked
    003male22.0107.2500S
    111female38.01071.2833C
    213female26.0007.9250S
    311female35.01053.1000S
    403male35.0008.0500S
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", + "0 0 3 male 22.0 1 0 7.2500 S\n", + "1 1 1 female 38.0 1 0 71.2833 C\n", + "2 1 3 female 26.0 0 0 7.9250 S\n", + "3 1 1 female 35.0 1 0 53.1000 S\n", + "4 0 3 male 35.0 0 0 8.0500 S" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Visualizing relationships" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Survived')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "plt.scatter(titanic_df['Age'], titanic_df['Survived'])\n", + "plt.xlabel('Age')\n", + "plt.ylabel('Survived')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Survived')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "plt.scatter(titanic_df['Fare'], titanic_df['Survived'])\n", + "plt.xlabel('Fare')\n", + "plt.ylabel('Survived')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    Survived01
    Sex
    female64195
    male36093
    \n", + "
    " + ], + "text/plain": [ + "Survived 0 1\n", + "Sex \n", + "female 64 195\n", + "male 360 93" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.crosstab(titanic_df['Sex'], titanic_df['Survived'])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    Survived01
    Pclass
    164120
    29083
    327085
    \n", + "
    " + ], + "text/plain": [ + "Survived 0 1\n", + "Pclass \n", + "1 64 120\n", + "2 90 83\n", + "3 270 85" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.crosstab(titanic_df['Pclass'], titanic_df['Survived'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassAgeSibSpParchFare
    Survived1.000000-0.356462-0.082446-0.0155230.0952650.266100
    Pclass-0.3564621.000000-0.3659020.0651870.023666-0.552893
    Age-0.082446-0.3659021.000000-0.307351-0.1878960.093143
    SibSp-0.0155230.065187-0.3073511.0000000.3833380.139860
    Parch0.0952650.023666-0.1878960.3833381.0000000.206624
    Fare0.266100-0.5528930.0931430.1398600.2066241.000000
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Age SibSp Parch Fare\n", + "Survived 1.000000 -0.356462 -0.082446 -0.015523 0.095265 0.266100\n", + "Pclass -0.356462 1.000000 -0.365902 0.065187 0.023666 -0.552893\n", + "Age -0.082446 -0.365902 1.000000 -0.307351 -0.187896 0.093143\n", + "SibSp -0.015523 0.065187 -0.307351 1.000000 0.383338 0.139860\n", + "Parch 0.095265 0.023666 -0.187896 0.383338 1.000000 0.206624\n", + "Fare 0.266100 -0.552893 0.093143 0.139860 0.206624 1.000000" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_data_corr = titanic_df.corr()\n", + "titanic_data_corr" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 10))\n", + "sns.heatmap(titanic_data_corr, annot=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked
    003122.0107.2500S
    111038.01071.2833C
    213026.0007.9250S
    311035.01053.1000S
    403135.0008.0500S
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", + "0 0 3 1 22.0 1 0 7.2500 S\n", + "1 1 1 0 38.0 1 0 71.2833 C\n", + "2 1 3 0 26.0 0 0 7.9250 S\n", + "3 1 1 0 35.0 1 0 53.1000 S\n", + "4 0 3 1 35.0 0 0 8.0500 S" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import preprocessing\n", + "\n", + "label_encoding = preprocessing.LabelEncoder()\n", + "titanic_df['Sex'] = label_encoding.fit_transform(titanic_df['Sex'].astype(str))\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['female', 'male'], dtype=object)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "label_encoding.classes_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### C = Cherbourg, Q = Queenstown, S = Southampton" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003122.0107.2500001
    111038.01071.2833100
    213026.0007.9250001
    311035.01053.1000001
    403135.0008.0500001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C Embarked_Q \\\n", + "0 0 3 1 22.0 1 0 7.2500 0 0 \n", + "1 1 1 0 38.0 1 0 71.2833 1 0 \n", + "2 1 3 0 26.0 0 0 7.9250 0 0 \n", + "3 1 1 0 35.0 1 0 53.1000 0 0 \n", + "4 0 3 1 35.0 0 0 8.0500 0 0 \n", + "\n", + " Embarked_S \n", + "0 1 \n", + "1 0 \n", + "2 1 \n", + "3 1 \n", + "4 1 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'])\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003014.0007.8542001
    111128.00026.5500001
    211036.012120.0000001
    303117.0107.0542001
    40314.04231.2750001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C \\\n", + "0 0 3 0 14.0 0 0 7.8542 0 \n", + "1 1 1 1 28.0 0 0 26.5500 0 \n", + "2 1 1 0 36.0 1 2 120.0000 0 \n", + "3 0 3 1 17.0 1 0 7.0542 0 \n", + "4 0 3 1 4.0 4 2 31.2750 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = titanic_df.sample(frac=1).reset_index(drop=True)\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "titanic_df.to_csv('datasets/titanic_processed.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "fashion-mnist_train.csv titanic_processed.csv titanic_train.csv\r\n" + ] + } + ], + "source": [ + "!ls datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Machine 
Learning/02-BinaryClassification_LogisticRegression_Titanic-Copy1.ipynb b/Machine Learning/02-BinaryClassification_LogisticRegression_Titanic-Copy1.ipynb new file mode 100644 index 00000000..774410b9 --- /dev/null +++ b/Machine Learning/02-BinaryClassification_LogisticRegression_Titanic-Copy1.ipynb @@ -0,0 +1,834 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003014.0007.8542001
    111128.00026.5500001
    211036.012120.0000001
    303117.0107.0542001
    40314.04231.2750001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C \\\n", + "0 0 3 0 14.0 0 0 7.8542 0 \n", + "1 1 1 1 28.0 0 0 26.5500 0 \n", + "2 1 1 0 36.0 1 2 120.0000 0 \n", + "3 0 3 1 17.0 1 0 7.0542 0 \n", + "4 0 3 1 4.0 4 2 31.2750 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.read_csv('titanic_processed.csv')\n", + "\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(712, 10)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = titanic_df.drop('Survived', axis=1)\n", + "Y = titanic_df['Survived']\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((569, 9), (569,))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train.shape, y_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((143, 9), (143,))" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logistic regression for classification\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = logistic_model.predict(x_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Confusion matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pred_results = pd.DataFrame({'y_test': y_test,\n", + " 'y_pred': y_pred})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_testy_pred
    67311
    25500
    64311
    30511
    62400
    \n", + "
    " + ], + "text/plain": [ + " y_test y_pred\n", + "673 1 1\n", + "255 0 0\n", + "643 1 1\n", + "305 1 1\n", + "624 0 0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_results.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_test01
    y_pred
    07019
    11143
    \n", + "
    " + ], + "text/plain": [ + "y_test 0 1\n", + "y_pred \n", + "0 70 19\n", + "1 11 43" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)\n", + "\n", + "titanic_crosstab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Precision-recall scores\n", + "\n", + "When we use these for multiclass classification we need to specify an averaging method to determine how the precision and recall scores for different labels should be weighted\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy_score : 0.7902097902097902\n", + "precision_score : 0.7962962962962963\n", + "recall_score : 0.6935483870967742\n" + ] + } + ], + "source": [ + "acc = accuracy_score(y_test, y_pred)\n", + "prec = precision_score(y_test, y_pred)\n", + "recall = recall_score(y_test, y_pred)\n", + "\n", + "print(\"accuracy_score : \", acc)\n", + "print(\"precision_score : \", prec)\n", + "print(\"recall_score : \", recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_test01
    y_pred
    07019
    11143
    \n", + "
    " + ], + "text/plain": [ + "y_test 0 1\n", + "y_pred \n", + "0 70 19\n", + "1 11 43" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_crosstab" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "TP = titanic_crosstab[1][1]\n", + "TN = titanic_crosstab[0][0]\n", + "FP = titanic_crosstab[0][1]\n", + "FN = titanic_crosstab[1][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7902097902097902" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score_verified = (TP + TN) / (TP + FP + TN + FN)\n", + "\n", + "accuracy_score_verified" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7962962962962963" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision_score_survived = TP / (TP + FP)\n", + "\n", + "precision_score_survived" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6935483870967742" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recall_score_survived = TP / (TP + FN)\n", + "recall_score_survived" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.pairplot(titanic_df)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import LinearSVC" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearSVC(dual=False, tol=0.001)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LinearSVC(C=1.0, max_iter=1000, tol=1e-3, dual=False)\n", + "model.fit(x_train, y_train) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_train = model.predict(x_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7943760984182777\n", + "0.7658536585365854\n", + "0.6946902654867256\n" + ] + } + ], + "source": [ + "print(accuracy_score(y_train, y_pred_train, normalize=True))\n", + "print(precision_score(y_train, y_pred_train))\n", + "print(recall_score(y_train, y_pred_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = model.predict(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7762237762237763\n", + "0.7777777777777778\n", + "0.6774193548387096\n" + ] + } + ], + "source": [ + "print(accuracy_score(y_test, y_pred, normalize=True))\n", + "print(precision_score(y_test, y_pred))\n", + "print(recall_score(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_test01
    y_pred
    06920
    11242
    \n", + "
    " + ], + "text/plain": [ + "y_test 0 1\n", + "y_pred \n", + "0 69 20\n", + "1 12 42" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "pd.crosstab(pd.DataFrame({'y_test': y_test,'y_pred': y_pred}).y_pred, pred_results.y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Machine Learning/02-BinaryClassification_LogisticRegression_Titanic.ipynb b/Machine Learning/02-BinaryClassification_LogisticRegression_Titanic.ipynb new file mode 100644 index 00000000..3197cb2f --- /dev/null +++ b/Machine Learning/02-BinaryClassification_LogisticRegression_Titanic.ipynb @@ -0,0 +1,679 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003014.0007.8542001
    111128.00026.5500001
    211036.012120.0000001
    303117.0107.0542001
    40314.04231.2750001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C \\\n", + "0 0 3 0 14.0 0 0 7.8542 0 \n", + "1 1 1 1 28.0 0 0 26.5500 0 \n", + "2 1 1 0 36.0 1 2 120.0000 0 \n", + "3 0 3 1 17.0 1 0 7.0542 0 \n", + "4 0 3 1 4.0 4 2 31.2750 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.read_csv('titanic_processed.csv')\n", + "\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(712, 10)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = titanic_df.drop('Survived', axis=1)\n", + "Y = titanic_df['Survived']\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((569, 9), (569,))" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train.shape, y_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((143, 9), (143,))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_test.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logistic regression for classification\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + ] + 
}, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = logistic_model.predict(x_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Confusion matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "pred_results = pd.DataFrame({'y_test': y_test,\n", + " 'y_pred': y_pred})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_testy_pred
    23801
    36711
    20100
    20000
    63300
    \n", + "
    " + ], + "text/plain": [ + " y_test y_pred\n", + "238 0 1\n", + "367 1 1\n", + "201 0 0\n", + "200 0 0\n", + "633 0 0" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pred_results.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_test01
    y_pred
    08316
    11133
    \n", + "
    " + ], + "text/plain": [ + "y_test 0 1\n", + "y_pred \n", + "0 83 16\n", + "1 11 33" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)\n", + "\n", + "titanic_crosstab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Precision-recall scores\n", + "\n", + "When we use these for multiclass classification we need to specify an averaging method to determine how the precision and recall scores for different labels should be weighted\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy_score : 0.8111888111888111\n", + "precision_score : 0.75\n", + "recall_score : 0.673469387755102\n" + ] + } + ], + "source": [ + "acc = accuracy_score(y_test, y_pred)\n", + "prec = precision_score(y_test, y_pred)\n", + "recall = recall_score(y_test, y_pred)\n", + "\n", + "print(\"accuracy_score : \", acc)\n", + "print(\"precision_score : \", prec)\n", + "print(\"recall_score : \", recall)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    y_test01
    y_pred
    08316
    11133
    \n", + "
    " + ], + "text/plain": [ + "y_test 0 1\n", + "y_pred \n", + "0 83 16\n", + "1 11 33" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_crosstab" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "TP = titanic_crosstab[1][1]\n", + "TN = titanic_crosstab[0][0]\n", + "FP = titanic_crosstab[0][1]\n", + "FN = titanic_crosstab[1][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8111888111888111" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score_verified = (TP + TN) / (TP + FP + TN + FN)\n", + "\n", + "accuracy_score_verified" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.75" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision_score_survived = TP / (TP + FP)\n", + "\n", + "precision_score_survived" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.673469387755102" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recall_score_survived = TP / (TP + FN)\n", + "recall_score_survived" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.pairplot(titanic_df)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Machine Learning/03-MultipleClassificationModels_Titanic.ipynb b/Machine Learning/03-MultipleClassificationModels_Titanic.ipynb new file mode 100644 index 00000000..6f1b784a --- /dev/null +++ b/Machine Learning/03-MultipleClassificationModels_Titanic.ipynb @@ -0,0 +1,1171 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.neighbors import RadiusNeighborsClassifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003014.0007.8542001
    111128.00026.5500001
    211036.012120.0000001
    303117.0107.0542001
    40314.04231.2750001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C \\\n", + "0 0 3 0 14.0 0 0 7.8542 0 \n", + "1 1 1 1 28.0 0 0 26.5500 0 \n", + "2 1 1 0 36.0 1 2 120.0000 0 \n", + "3 0 3 1 17.0 1 0 7.0542 0 \n", + "4 0 3 1 4.0 4 2 31.2750 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.read_csv('datasets/titanic_processed.csv')\n", + "\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Pclass',\n", + " 'Sex',\n", + " 'Age',\n", + " 'SibSp',\n", + " 'Parch',\n", + " 'Fare',\n", + " 'Embarked_C',\n", + " 'Embarked_Q',\n", + " 'Embarked_S']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FEATURES = list(titanic_df.columns[1:])\n", + "\n", + "FEATURES" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "result_dict = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_classification(y_test, y_pred):\n", + " \n", + " acc = accuracy_score(y_test, y_pred, normalize=True)\n", + " num_acc = accuracy_score(y_test, y_pred, normalize=False)\n", + "\n", + " prec = precision_score(y_test, y_pred)\n", + " recall = recall_score(y_test, y_pred)\n", + " \n", + " return {'accuracy': acc, \n", + " 'precision': prec,\n", + " 'recall':recall, \n", + " 'accuracy_count':num_acc}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def build_model(classifier_fn, \n", + " name_of_y_col, \n", + " names_of_x_cols, \n", + " dataset, \n", + " test_frac=0.2):\n", + " \n", + " X = dataset[names_of_x_cols]\n", + " Y = 
dataset[name_of_y_col]\n", + "\n", + " x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)\n", + " \n", + " model = classifier_fn(x_train, y_train)\n", + " \n", + " y_pred = model.predict(x_test)\n", + "\n", + " y_pred_train = model.predict(x_train)\n", + " \n", + " train_summary = summarize_classification(y_train, y_pred_train)\n", + " test_summary = summarize_classification(y_test, y_pred)\n", + " \n", + " pred_results = pd.DataFrame({'y_test': y_test,\n", + " 'y_pred': y_pred})\n", + " \n", + " model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)\n", + " \n", + " return {'training': train_summary, \n", + " 'test': test_summary,\n", + " 'confusion_matrix': model_crosstab}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def compare_results():\n", + " for key in result_dict:\n", + " print('Classification: ', key)\n", + "\n", + " print()\n", + " print('Training data')\n", + " for score in result_dict[key]['training']:\n", + " print(score, result_dict[key]['training'][score])\n", + "\n", + " print()\n", + " print('Test data')\n", + " for score in result_dict[key]['test']:\n", + " print(score, result_dict[key]['test'][score])\n", + " \n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def logistic_fn(x_train, y_train):\n", + " \n", + " model = LogisticRegression(solver='liblinear')\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 
0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ logistic'] = build_model(logistic_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def linear_discriminant_fn(x_train, y_train, solver='svd'):\n", + " \n", + " model = LinearDiscriminantAnalysis(solver=solver)\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7752293577981652\n", + "recall 0.7161016949152542\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7692307692307693\n", + "precision 0.7209302325581395\n", + "recall 0.5961538461538461\n", + "accuracy_count 110\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/lib/python3.7/site-packages/sklearn/discriminant_analysis.py:388: UserWarning: Variables are collinear.\n", + " warnings.warn(\"Variables are collinear.\")\n" + ] + } + ], + "source": [ + "result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, 
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn,\n", + " 'Survived',\n", + " FEATURES[0:-1],\n", + " titanic_df)\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def quadratic_discriminant_fn(x_train, y_train):\n", + " \n", + " model = QuadraticDiscriminantAnalysis()\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 
0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(quadratic_discriminant_fn,\n", + " 'Survived',\n", + " FEATURES[0:-1],\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def sgd_fn(x_train, y_train, max_iter=1000, tol=1e-3):\n", + " \n", + " model = SGDClassifier(max_iter=max_iter, tol=tol)\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 
0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ sgd\n", + "\n", + "Training data\n", + "accuracy 0.7363796133567663\n", + "precision 0.7547169811320755\n", + "recall 0.5194805194805194\n", + "accuracy_count 419\n", + "\n", + "Test data\n", + "accuracy 0.7482517482517482\n", + "precision 0.7441860465116279\n", + "recall 0.5614035087719298\n", + "accuracy_count 107\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ sgd'] = build_model(sgd_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LinearSVC\n", + "\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html\n", + "\n", + "* SVC with a linear kernel\n", + "* dual=False when number of samples > number of features" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):\n", + " \n", + " model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)\n", + " model.fit(x_train, y_train) \n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 
0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ sgd\n", + "\n", + "Training data\n", + "accuracy 0.7363796133567663\n", + "precision 0.7547169811320755\n", + "recall 0.5194805194805194\n", + "accuracy_count 419\n", + "\n", + "Test data\n", + "accuracy 0.7482517482517482\n", + "precision 0.7441860465116279\n", + "recall 0.5614035087719298\n", + "accuracy_count 107\n", + "\n", + "Classification: survived ~ linear_svc\n", + "\n", + "Training data\n", + "accuracy 0.789103690685413\n", + "precision 0.7652582159624414\n", + "recall 0.6995708154506438\n", + "accuracy_count 449\n", + "\n", + "Test data\n", + "accuracy 0.8531468531468531\n", + "precision 0.84\n", + "recall 0.7636363636363637\n", + "accuracy_count 122\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ linear_svc'] = build_model(linear_svc_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def radius_neighbor_fn(x_train, y_train, radius=40.0):\n", + "\n", + " model = 
RadiusNeighborsClassifier(radius=radius)\n", + " model.fit(x_train, y_train) \n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ sgd\n", + "\n", + "Training data\n", + "accuracy 0.7363796133567663\n", + "precision 0.7547169811320755\n", + "recall 0.5194805194805194\n", + "accuracy_count 419\n", + "\n", + "Test data\n", + "accuracy 0.7482517482517482\n", + "precision 0.7441860465116279\n", + "recall 0.5614035087719298\n", + "accuracy_count 107\n", + "\n", + "Classification: survived ~ linear_svc\n", + "\n", + "Training data\n", + "accuracy 0.789103690685413\n", + "precision 0.7652582159624414\n", + "recall 0.6995708154506438\n", + "accuracy_count 449\n", + "\n", + "Test data\n", + "accuracy 
0.8531468531468531\n", + "precision 0.84\n", + "recall 0.7636363636363637\n", + "accuracy_count 122\n", + "\n", + "Classification: survived ~ radius_neighbors\n", + "\n", + "Training data\n", + "accuracy 0.6590509666080844\n", + "precision 0.6931818181818182\n", + "recall 0.2675438596491228\n", + "accuracy_count 375\n", + "\n", + "Test data\n", + "accuracy 0.6853146853146853\n", + "precision 0.8571428571428571\n", + "recall 0.3\n", + "accuracy_count 98\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ radius_neighbors'] = build_model(radius_neighbor_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "compare_results()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "max_depth = None [ If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples ]\n", + "\n", + "max_features = None [None -- max_features=n_features, \n", + " auto -- then max_features=sqrt(n_features), \n", + " sqrt -- then max_features=sqrt(n_features), \n", + " log2 -- then max_features=log2(n_features)]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None): \n", + " \n", + " model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: 
survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ sgd\n", + "\n", + "Training data\n", + "accuracy 0.7363796133567663\n", + "precision 0.7547169811320755\n", + "recall 0.5194805194805194\n", + "accuracy_count 419\n", + "\n", + "Test data\n", + "accuracy 0.7482517482517482\n", + "precision 0.7441860465116279\n", + "recall 0.5614035087719298\n", + "accuracy_count 107\n", + "\n", + "Classification: survived ~ linear_svc\n", + "\n", + "Training data\n", + "accuracy 0.789103690685413\n", + "precision 0.7652582159624414\n", + "recall 0.6995708154506438\n", + "accuracy_count 449\n", + "\n", + "Test data\n", + "accuracy 0.8531468531468531\n", + "precision 0.84\n", + "recall 0.7636363636363637\n", + "accuracy_count 122\n", + "\n", + "Classification: survived ~ radius_neighbors\n", + "\n", + "Training data\n", + "accuracy 0.6590509666080844\n", + "precision 0.6931818181818182\n", + "recall 0.2675438596491228\n", + "accuracy_count 375\n", + "\n", + "Test data\n", + "accuracy 0.6853146853146853\n", + "precision 0.8571428571428571\n", + "recall 0.3\n", + "accuracy_count 98\n", + "\n", + "Classification: survived ~ decision_tree\n", + "\n", + "Training data\n", + "accuracy 0.9859402460456942\n", + "precision 1.0\n", + "recall 0.9644444444444444\n", + "accuracy_count 561\n", 
+ "\n", + "Test data\n", + "accuracy 0.7132867132867133\n", + "precision 0.6896551724137931\n", + "recall 0.6349206349206349\n", + "accuracy_count 102\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ decision_tree'] = build_model(decision_tree_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def naive_bayes_fn(x_train,y_train, priors=None):\n", + " \n", + " model = GaussianNB(priors=priors)\n", + " model.fit(x_train, y_train)\n", + " \n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification: survived ~ logistic\n", + "\n", + "Training data\n", + "accuracy 0.7908611599297012\n", + "precision 0.7729468599033816\n", + "recall 0.6896551724137931\n", + "accuracy_count 450\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7407407407407407\n", + "recall 0.7142857142857143\n", + "accuracy_count 113\n", + "\n", + "Classification: survived ~ linear_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.7961335676625659\n", + "precision 0.7560975609756098\n", + "recall 0.7013574660633484\n", + "accuracy_count 453\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.86\n", + "recall 0.6417910447761194\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ quadratic_discriminant_analysis\n", + "\n", + "Training data\n", + "accuracy 0.81195079086116\n", + "precision 0.7741935483870968\n", + "recall 0.7433628318584071\n", + "accuracy_count 462\n", + "\n", + "Test data\n", + "accuracy 0.7832167832167832\n", + "precision 0.7719298245614035\n", + "recall 0.7096774193548387\n", + "accuracy_count 112\n", + "\n", + "Classification: survived ~ sgd\n", + "\n", + "Training data\n", + "accuracy 
0.7363796133567663\n", + "precision 0.7547169811320755\n", + "recall 0.5194805194805194\n", + "accuracy_count 419\n", + "\n", + "Test data\n", + "accuracy 0.7482517482517482\n", + "precision 0.7441860465116279\n", + "recall 0.5614035087719298\n", + "accuracy_count 107\n", + "\n", + "Classification: survived ~ linear_svc\n", + "\n", + "Training data\n", + "accuracy 0.789103690685413\n", + "precision 0.7652582159624414\n", + "recall 0.6995708154506438\n", + "accuracy_count 449\n", + "\n", + "Test data\n", + "accuracy 0.8531468531468531\n", + "precision 0.84\n", + "recall 0.7636363636363637\n", + "accuracy_count 122\n", + "\n", + "Classification: survived ~ radius_neighbors\n", + "\n", + "Training data\n", + "accuracy 0.6590509666080844\n", + "precision 0.6931818181818182\n", + "recall 0.2675438596491228\n", + "accuracy_count 375\n", + "\n", + "Test data\n", + "accuracy 0.6853146853146853\n", + "precision 0.8571428571428571\n", + "recall 0.3\n", + "accuracy_count 98\n", + "\n", + "Classification: survived ~ decision_tree\n", + "\n", + "Training data\n", + "accuracy 0.9859402460456942\n", + "precision 1.0\n", + "recall 0.9644444444444444\n", + "accuracy_count 561\n", + "\n", + "Test data\n", + "accuracy 0.7132867132867133\n", + "precision 0.6896551724137931\n", + "recall 0.6349206349206349\n", + "accuracy_count 102\n", + "\n", + "Classification: survived ~ naive_bayes\n", + "\n", + "Training data\n", + "accuracy 0.7680140597539543\n", + "precision 0.7021276595744681\n", + "recall 0.7268722466960352\n", + "accuracy_count 437\n", + "\n", + "Test data\n", + "accuracy 0.7902097902097902\n", + "precision 0.7540983606557377\n", + "recall 0.7540983606557377\n", + "accuracy_count 113\n", + "\n" + ] + } + ], + "source": [ + "result_dict['survived ~ naive_bayes'] = build_model(naive_bayes_fn,\n", + " 'Survived',\n", + " FEATURES,\n", + " titanic_df)\n", + "\n", + "compare_results()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Machine Learning/04-HyperparameterTuningWithGridSearch.ipynb b/Machine Learning/04-HyperparameterTuningWithGridSearch.ipynb new file mode 100644 index 00000000..7323b692 --- /dev/null +++ b/Machine Learning/04-HyperparameterTuningWithGridSearch.ipynb @@ -0,0 +1,485 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import precision_score\n", + "from sklearn.metrics import recall_score\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
    003014.0007.8542001
    111128.00026.5500001
    211036.012120.0000001
    303117.0107.0542001
    40314.04231.2750001
    \n", + "
    " + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked_C \\\n", + "0 0 3 0 14.0 0 0 7.8542 0 \n", + "1 1 1 1 28.0 0 0 26.5500 0 \n", + "2 1 1 0 36.0 1 2 120.0000 0 \n", + "3 0 3 1 17.0 1 0 7.0542 0 \n", + "4 0 3 1 4.0 4 2 31.2750 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "titanic_df = pd.read_csv('datasets/titanic_processed.csv')\n", + "\n", + "titanic_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "X = titanic_df.drop('Survived', axis=1)\n", + "\n", + "Y = titanic_df['Survived']\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def summarize_classification(y_test, y_pred):\n", + " \n", + " acc = accuracy_score(y_test, y_pred, normalize=True)\n", + " num_acc = accuracy_score(y_test, y_pred, normalize=False)\n", + "\n", + " prec = precision_score(y_test, y_pred)\n", + " recall = recall_score(y_test, y_pred)\n", + " \n", + " print(\"Test data count: \",len(y_test))\n", + " print(\"accuracy_count : \" , num_acc)\n", + " print(\"accuracy_score : \" , acc)\n", + " print(\"precision_score : \" , prec)\n", + " print(\"recall_score : \", recall)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'max_depth': 5}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "\n", + "parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}\n", + "\n", + "grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)\n", + 
"grid_search.fit(x_train, y_train)\n", + "\n", + "grid_search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameters: {'max_depth': 2}\n", + "Mean Test Score: 0.7609841827768014\n", + "Rank: 5\n", + "Parameters: {'max_depth': 4}\n", + "Mean Test Score: 0.7644991212653779\n", + "Rank: 4\n", + "Parameters: {'max_depth': 5}\n", + "Mean Test Score: 0.7855887521968365\n", + "Rank: 1\n", + "Parameters: {'max_depth': 7}\n", + "Mean Test Score: 0.7803163444639719\n", + "Rank: 2\n", + "Parameters: {'max_depth': 9}\n", + "Mean Test Score: 0.7662565905096661\n", + "Rank: 3\n", + "Parameters: {'max_depth': 10}\n", + "Mean Test Score: 0.7557117750439367\n", + "Rank: 6\n" + ] + } + ], + "source": [ + "for i in range(6):\n", + " print('Parameters: ', grid_search.cv_results_['params'][i])\n", + "\n", + " print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])\n", + " \n", + " print('Rank: ', grid_search.cv_results_['rank_test_score'][i])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "decision_tree_model = DecisionTreeClassifier( \\\n", + " max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = decision_tree_model.predict(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data count: 143\n", + "accuracy_count : 117\n", + "accuracy_score : 0.8181818181818182\n", + "precision_score : 0.7894736842105263\n", + "recall_score : 0.7627118644067796\n", + "\n" + ] + } + ], + "source": [ + "summarize_classification(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "{'C': 0.4, 'penalty': 'l1'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "parameters = {'penalty': ['l1', 'l2'], \n", + " 'C': [0.1, 0.4, 0.8, 1, 2, 5]}\n", + "\n", + "grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)\n", + "grid_search.fit(x_train, y_train)\n", + "\n", + "grid_search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameters: {'C': 0.1, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7451669595782073\n", + "Rank: 12\n", + "Parameters: {'C': 0.1, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7539543057996485\n", + "Rank: 11\n", + "Parameters: {'C': 0.4, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7803163444639719\n", + "Rank: 1\n", + "Parameters: {'C': 0.4, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7750439367311072\n", + "Rank: 8\n", + "Parameters: {'C': 0.8, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7785588752196837\n", + "Rank: 4\n", + "Parameters: {'C': 0.8, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7785588752196837\n", + "Rank: 4\n", + "Parameters: {'C': 1, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7750439367311072\n", + "Rank: 8\n", + "Parameters: {'C': 1, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7768014059753954\n", + "Rank: 7\n", + "Parameters: {'C': 2, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7750439367311072\n", + "Rank: 8\n", + "Parameters: {'C': 2, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7803163444639719\n", + "Rank: 1\n", + "Parameters: {'C': 5, 'penalty': 'l1'}\n", + "Mean Test Score: 0.7803163444639719\n", + "Rank: 1\n", + "Parameters: {'C': 5, 'penalty': 'l2'}\n", + "Mean Test Score: 0.7785588752196837\n", + "Rank: 4\n" + ] + } + ], + "source": [ + "for i in range(12):\n", + " print('Parameters: ', grid_search.cv_results_['params'][i])\n", + " 
print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])\n", + " print('Rank: ', grid_search.cv_results_['rank_test_score'][i])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "logistic_model = LogisticRegression(solver='liblinear', \\\n", + " penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \\\n", + " fit(x_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = logistic_model.predict(x_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data count: 143\n", + "accuracy_count : 116\n", + "accuracy_score : 0.8111888111888111\n", + "precision_score : 0.7962962962962963\n", + "recall_score : 0.7288135593220338\n", + "\n" + ] + } + ], + "source": [ + "summarize_classification(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}