diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index ca85519b28..ba4b1da86a 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -62,23 +62,13 @@ def __init__(self, datap, case, typean, period): super().__init__(datap, case, typean, period) # output directories - self.d_resultsallpmc = self.cfg(f"mc.results.{period}") if period is not None else self.cfg("mc.resultsallp") - self.d_resultsallpdata = ( - self.cfg(f"data.results.{period}") if period is not None else self.cfg("data.resultsallp") - ) + suffix = f"results.{period}" if period is not None else "resultsallp" + self.d_resultsallpmc = self.cfg(f"mc.{suffix}") + self.d_resultsallpdata = self.cfg(f"data.{suffix}") # input directories (processor output) - self.d_resultsallpmc_proc = self.d_resultsallpmc - self.d_resultsallpdata_proc = self.d_resultsallpdata - # use a different processor output - if "data_proc" in datap["analysis"][typean]: - self.d_resultsallpdata_proc = ( - self.cfg(f"data_proc.results.{period}") if period is not None else self.cfg("data_proc.resultsallp") - ) - if "mc_proc" in datap["analysis"][typean]: - self.d_resultsallpmc_proc = ( - self.cfg(f"mc_proc.results.{period}") if period is not None else self.cfg("mc_proc.resultsallp") - ) + self.d_resultsallpdata_proc = self.cfg(f"data_proc.{suffix}") + self.d_resultsallpmc_proc = self.cfg(f"mc_proc.{suffix}") # input files n_filemass_name = datap["files_names"]["histofilename"] @@ -93,6 +83,7 @@ def __init__(self, datap, case, typean, period): self.p_pdfnames = datap["analysis"][self.typean].get("pdf_names") self.p_param_names = datap["analysis"][self.typean].get("param_names") + # TODO: should come entirely from DB self.observables = { "qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"], "all": [*self.cfg("observables", {})], @@ -104,7 +95,6 @@ def __init__(self, datap, case, typean, period): self.fit_levels = self.cfg("fit_levels", ["mc", "data"]) self.fit_sigma = {} self.fit_mean = {} - self.fit_func_bkg = {} self.fit_range = {} self.hcandeff = {"pr": None, "np": None} self.hcandeff_gen = {} @@ -124,6 +114,7 @@ def __init__(self, datap, case, typean, period): for param, symbol in zip( ("mean", "sigma", "significance", "chi2"), ("#it{#mu}", "#it{#sigma}", "significance", "#it{#chi}^{2}"), + strict=False, ) } for level in self.fit_levels @@ -140,10 +131,8 @@ def __init__(self, datap, case, typean, period): self.file_out_histo = TFile(self.n_fileresult, "recreate") self.fitter = RooFitter() - self.roo_ws = {} - self.roo_ws_ptjet = {} - self.roows = {} - self.roows_ptjet = {} + self.roo_ws = {} # ROOT workspaces stored at various levels + self.roows = {} # ROOT workspaces at latest level # region helpers def _save_canvas(self, canvas, filename): @@ -487,17 +476,13 @@ def _fit_mass(self, hist, filename=None): # pylint: disable=too-many-branches,too-many-statements def fit(self): if not self.cfg("hfjet", True): - self.logger.info("Not fitting mass distributions for inclusive jets") return self.logger.info("Fitting inclusive mass distributions") gStyle.SetOptFit(1111) for level in self.fit_levels: self.fit_mean[level] = [None] * self.nbins self.fit_sigma[level] = [None] * self.nbins - self.fit_func_bkg[level] = [None] * self.nbins self.fit_range[level] = [None] * self.nbins - self.roo_ws[level] = [None] * self.nbins - self.roo_ws_ptjet[level] = [[None] * self.nbins for _ in range(10)] rfilename = self.n_filemass_mc if "mc" in level else self.n_filemass fitcfg = None self.logger.debug("Opening file %s.", rfilename) @@ -528,23 +513,22 @@ def fit(self): if h_invmass.GetEntries() < 100: # TODO: reconsider criterion self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt) continue - fit_res, _, func_bkg = self._fit_mass( + fit_res, _, _ = self._fit_mass( h_invmass, f"fit/h_mass_fitted_{string_range_pthf(range_pthf)}_{level}.png" ) if fit_res and fit_res.Get() and fit_res.IsValid(): self.fit_mean[level][ipt] = fit_res.Parameter(1) self.fit_sigma[level][ipt] = fit_res.Parameter(2) - self.fit_func_bkg[level][ipt] = func_bkg else: self.logger.error("Fit failed for %s bin %d", level, ipt) if self.cfg("mass_roofit"): for entry in self.cfg("mass_roofit", []): - if lvl := entry.get("level"): - if lvl != level: - continue - if ptspec := entry.get("ptrange"): - if ptspec[0] > range_pthf[0] or ptspec[1] < range_pthf[1]: - continue + if (lvl := entry.get("level")) and lvl != level: + continue + if (ptspec := entry.get("ptrange")) and ( + ptspec[0] > range_pthf[0] or ptspec[1] < range_pthf[1] + ): + continue fitcfg = entry break self.logger.debug("Using fit config for %i: %s", ipt, fitcfg) @@ -559,7 +543,7 @@ def fit(self): if h_invmass.GetEntries() < 100: # TODO: reconsider criterion self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt) continue - roows = self.roows.get(ipt) if iptjet is None else self.roows_ptjet.get((iptjet, ipt)) + roows = self.roows.get((iptjet, ipt)) if roows is None and level != self.fit_levels[0]: self.logger.critical( "missing previous fit result, cannot fit %s iptjet %s ipt %d", level, iptjet, ipt @@ -596,18 +580,17 @@ def fit(self): # roo_ws.Print() # TODO: save snapshot per level # roo_ws.saveSnapshot(level, None) - if iptjet is not None: - self.logger.debug("Setting roows_ptjet for %s iptjet %s ipt %d", level, iptjet, ipt) - self.roows_ptjet[(iptjet, ipt)] = roo_ws - self.roo_ws_ptjet[level][iptjet][ipt] = roo_ws - else: - self.logger.debug("Setting roows for %s iptjet %s ipt %d", level, iptjet, ipt) - self.roows[ipt] = roo_ws - self.roo_ws[level][ipt] = roo_ws - for jptjet in range(get_nbins(h, 1)): - self.roows_ptjet[(jptjet, ipt)] = roo_ws.Clone() - self.roo_ws_ptjet[level][jptjet][ipt] = roo_ws.Clone() - # TODO: take parameter names from DB + self.logger.info("Setting roows_ptjet for %s iptjet %s ipt %d", level, iptjet, ipt) + self.roows[(iptjet, ipt)] = roo_ws.Clone() + self.roo_ws[(level, iptjet, ipt)] = roo_ws.Clone() + if iptjet is None: + if not fitcfg.get("per_ptjet"): + for jptjet in range(get_nbins(h, 1)): + self.logger.info( + "Overwriting roows_ptjet for %s iptjet %s ipt %d", level, jptjet, ipt + ) + self.roows[(jptjet, ipt)] = roo_ws.Clone() + self.roo_ws[(level, jptjet, ipt)] = roo_ws.Clone() if level in ("data", "mc"): varname_mean = fitcfg.get("var_mean", self.p_param_names["gauss_mean"]) varname_sigma = fitcfg.get("var_sigma", self.p_param_names["gauss_sigma"]) @@ -626,8 +609,6 @@ def fit(self): ipt + 1, roo_ws.var(varname_sigma).getError() ) varname_m = fitcfg.get("var", "m") - if roo_ws.pdf("bkg"): - self.fit_func_bkg[level][ipt] = roo_ws.pdf("bkg").asTF(roo_ws.var(varname_m)) self.fit_range[level][ipt] = ( roo_ws.var(varname_m).getMin("fit"), roo_ws.var(varname_m).getMax("fit"), @@ -660,12 +641,12 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): return None for entry in self.cfg("sidesub", []): - if level := entry.get("level"): - if level != mcordata: - continue - if ptrange_sel := entry.get("ptrange"): - if ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt + 1]: - continue + if (level := entry.get("level")) and level != mcordata: + continue + if (ptrange_sel := entry.get("ptrange")) and ( + ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt + 1] + ): + continue regcfg = entry["regions"] break regions = { @@ -699,7 +680,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): fh = {} area = {} - var_m = self.roows[ipt].var("m") + var_m = self.roows[(None, ipt)].var("m") for region in regions: # project out the mass regions (first axis) axes = list(range(get_dim(hist)))[1:] @@ -716,56 +697,37 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): [fh["sideband_left"], fh["sideband_right"]], f"h_ptjet{label}_sideband_{ipt}_{mcordata}" ) ensure_sumw2(fh_sideband) - - subtract_sidebands = False - if mcordata == "data" and self.cfg("sidesub_per_ptjet"): - self.logger.info("Subtracting sidebands in pt jet bins") - for iptjet in range(get_nbins(fh_subtracted, 0)): - if rws := self.roo_ws_ptjet[mcordata][iptjet][ipt]: - f = rws.pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m")) - else: - self.logger.error("Could not retrieve roows for %s-%i-%i", mcordata, iptjet, ipt) - continue + if mcordata == "data": + bins_ptjet = list(range(get_nbins(fh_subtracted, 0))) if self.cfg("sidesub_per_ptjet") else [None] + self.logger.info("Scaling sidebands in ptjet-%s bins: %s using %s", label, bins_ptjet, fh_sideband) + hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband + for iptjet in bins_ptjet: + if iptjet: + n = hx.GetBinContent(iptjet) + self.logger.info("Need to scale in ptjet %i: %g", iptjet, n) + if n <= 0: + continue + rws = self.roo_ws.get((mcordata, iptjet, ipt)) + if not rws: + self.logger.error("Falling back to incl. roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt) + rws = self.roo_ws.get((mcordata, None, ipt)) + if not rws: + self.logger.critical("Could not retrieve roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt) + f = rws.pdf("bkg").asTF(rws.var("m")) area = {region: f.Integral(*limits[region]) for region in regions} - self.logger.info( - "areas for %s-%s: %g, %g, %g", - mcordata, - ipt, - area["signal"], - area["sideband_left"], - area["sideband_right"], - ) + self.logger.info("areas for %s-iptjet%s-ipt%s: %s", mcordata, iptjet, ipt, area) if (area["sideband_left"] + area["sideband_right"]) > 0.0: - subtract_sidebands = True areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"]) - # TODO: extend to higher dimensions - for ibin in range(get_nbins(fh_subtracted, 1)): - scale_bin(fh_sideband, areaNormFactor, iptjet + 1, ibin + 1) - else: - for region in regions: - f = self.roo_ws[mcordata][ipt].pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m")) - area[region] = f.Integral(*limits[region]) - - self.logger.info( - "areas for %s-%s: %g, %g, %g", - mcordata, - ipt, - area["signal"], - area["sideband_left"], - area["sideband_right"], - ) - - if (area["sideband_left"] + area["sideband_right"]) > 0.0: - subtract_sidebands = True - areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"]) - fh_sideband.Scale(areaNormFactor) - + # TODO: generalize and extend to higher dimensions + if iptjet is None: + fh_sideband.Scale(areaNormFactor) + else: + for ibin in range(get_nbins(fh_subtracted, 1)): + scale_bin(fh_sideband, areaNormFactor, iptjet + 1, ibin + 1) + fh_subtracted.Add(fh_sideband, -1.0) self._save_hist(fh_sideband, f"sideband/h_ptjet{label}_sideband_{string_range_pthf(range_pthf)}_{mcordata}.png") - if subtract_sidebands: - fh_subtracted.Add(fh_sideband, -1.0) self._clip_neg(fh_subtracted) - self._save_hist( fh_subtracted, f"sideband/h_ptjet{label}_subtracted_notscaled_{string_range_pthf(range_pthf)}_{mcordata}.png", @@ -802,7 +764,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): self._save_canvas(c, filename) # TODO: calculate per ptjet bin - roows = self.roows[ipt] + roows = self.roows[(None, ipt)] roows.var("mean").setVal(self.fit_mean[mcordata][ipt]) roows.var("sigma_g1").setVal(self.fit_sigma[mcordata][ipt]) var_m.setRange("signal", *limits["signal"]) @@ -810,9 +772,9 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): var_m.setRange("sider", *limits["sideband_right"]) # correct for reflections if self.cfg("corr_refl") and (mcordata == "data" or not self.cfg("closure.filter_reflections")): - pdf_sig = self.roows[ipt].pdf("sig") - pdf_refl = self.roows[ipt].pdf("refl") - pdf_bkg = self.roows[ipt].pdf("bkg") + pdf_sig = self.roows[(None, ipt)].pdf("sig") + pdf_refl = self.roows[(None, ipt)].pdf("refl") + pdf_bkg = self.roows[(None, ipt)].pdf("bkg") frac_sig = roows.var("frac").getVal() if mcordata == "data" else 1.0 frac_bkg = 1.0 - frac_sig fac_sig = frac_sig * (1.0 - roows.var("frac_refl").getVal()) @@ -862,9 +824,9 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): self.h_reflcorr.SetBinContent(ipt + 1, corr) fh_subtracted.Scale(corr) - pdf_sig = self.roows[ipt].pdf("sig") + pdf_sig = self.roows[(None, ipt)].pdf("sig") frac_sig = pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() - if pdf_peak := self.roows[ipt].pdf("peak"): + if pdf_peak := self.roows[(None, ipt)].pdf("peak"): frac_peak = pdf_peak.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal() self.logger.info( "correcting %s-%i for fractional signal area: %g (Gaussian: %g)", mcordata, ipt, frac_sig, frac_peak