diff --git a/caim.py b/caim.py
index 385edad..5c9ee25 100644
--- a/caim.py
+++ b/caim.py
@@ -80,7 +80,7 @@ def _run_feature(feature_series, class_series):
         # Starting interval is end to end set
         disc_interval = np.array([remaining_int[0], remaining_int[-1]])
         remaining_int = remaining_int[1:-1]
-        f = lambda x: CAIM.compute_caim(CAIM.build_quanta(input_data, x, feature_name, class_name))
+        f = lambda x: CAIM.compute_ur_caim(CAIM.build_quanta(input_data, x, feature_name, class_name))

         global_caim = 0
         while not done:
@@ -149,6 +149,46 @@ def compute_caim(quanta):
         return ((max_r**2/m_r).sum()/n).values[0]
         #return pd.eval(((max_r**2/m_r).sum()/n))

+    @staticmethod
+    def compute_ur_caim(quanta):
+        # Get the M+r value (number of values in the interval for all classes)
+        m_r = quanta.sum(axis=0, level=0)
+
+        # Get the Mi+ value (number of values in all the intervals for the i-th class)
+        mi_ = quanta.sum(axis=1, level=0)
+
+        # Get Max_r (maximum class count for the bin)
+        max_r = quanta.max(axis=0, level=0)
+
+        # M: the total number of values,
+        # summed over all the intervals
+        m = m_r.sum()
+
+        pi_ = mi_/m
+        p_r = m_r/m
+        pir = quanta/m
+
+        # CAIM normalised by the number of continuous values
+        caim_n = ((max_r**2/m_r).sum()/m).values[0]
+
+        # class-attribute information
+        info = ((p_r/pi_).applymap(np.log2)*pir).sum(axis=0, level=0).sum().values[0]
+
+        # entropy
+        h = ((1/pir).applymap(np.log2)*pir).sum(axis=0, level=0).sum().values[0]
+
+        # modified mutual information
+        mi = ((pir/(pi_*p_r)).applymap(np.log2)*(1-pi_)).sum(axis=0, level=0).sum().values[0]
+
+        # class-attribute interdependence redundancy
+        cair = mi/h
+
+        # class-attribute interdependence uncertainty
+        caiu = info/h
+
+        return caim_n*cair*(1-caiu)
+
+
     def fit_parallel(self, X, Y, n_jobs=8):
         self._create_init_data(X, Y)
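
Note on the new scoring function: compute_ur_caim combines three quantities into the ur-CAIM criterion, ur-CAIM = CAIM_N * CAIR * (1 - CAIU), where CAIM_N is the CAIM score normalised by the total number of values, CAIR = MI/H (interdependence redundancy) and CAIU = INFO/H (interdependence uncertainty). The patch computes these on the MultiIndexed quanta frame from CAIM.build_quanta using the level= keyword of sum/max, which was removed in pandas 2.0. As a minimal sketch, assuming quanta reduces to a plain class-by-interval contingency table, the NumPy version below restates the same arithmetic; the function name ur_caim_from_counts and the toy counts are illustrative and not part of the patch.

import numpy as np

def ur_caim_from_counts(counts):
    # rows = classes, columns = discretisation intervals
    counts = np.asarray(counts, dtype=float)
    m_r = counts.sum(axis=0)      # M+r: per-interval totals
    mi_ = counts.sum(axis=1)      # Mi+: per-class totals
    max_r = counts.max(axis=0)    # max class count in each interval
    m = counts.sum()              # total number of values

    p_ir = counts / m             # joint class/interval distribution
    p_i = mi_ / m                 # class marginals
    p_r = m_r / m                 # interval marginals

    # CAIM normalised by the number of continuous values
    caim_n = (max_r ** 2 / m_r).sum() / m

    # class-attribute information: sum of p_ir * log2(p_r / p_i)
    info = (p_ir * np.log2(np.outer(1.0 / p_i, p_r))).sum()

    # entropy of the joint distribution: sum of p_ir * log2(1 / p_ir)
    h = -(p_ir * np.log2(p_ir)).sum()

    # modified mutual information, weighted by (1 - p_i) per class
    mi = ((1.0 - p_i)[:, None]
          * np.log2(p_ir / np.outer(p_i, p_r))).sum()

    cair = mi / h                 # class-attribute interdependence redundancy
    caiu = info / h               # class-attribute interdependence uncertainty
    return caim_n * cair * (1.0 - caiu)

# Toy 2-class x 2-interval quanta matrix; every cell is non-zero
# because the formulas above take logs of every cell
print(ur_caim_from_counts([[8, 1],
                           [2, 9]]))

Exactly as in the patch, a zero cell in the count matrix would appear to put a log2(0) into the modified mutual information term, so this sketch assumes quanta with non-empty cells.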