diff --git a/machine_learning_hep/multitrial/README.md b/machine_learning_hep/multitrial/README.md new file mode 100644 index 0000000000..8437924336 --- /dev/null +++ b/machine_learning_hep/multitrial/README.md @@ -0,0 +1,38 @@ +# Multitrial systematics with MLHEP + +## Generate configurations (MLHEP yml databases) for each trial + +File: `run-mlhep-fitter-multitrial.py`
+Usage: `python run-mlhep-fitter-multitrial.py database_file in_db_dir out_db_dir mlhep_results_dir_pattern` + +Arguments: +- `database_file`: filename of the template database without the .yml extension, e.g., `database_ml_parameters_LcToPKPi` +- `in_db_dir`: path to the directory containing the database, e.g., `data/data_run3` +- `out_db_dir`: path to the directory for output multitrial databases, e.g., `multitrial_db` +- `mlhep_results_dir_pattern`: prefix of output directory name for fit results; for each trial, the trial name is appended to the directory name, and the resulting directory name is written under `Run3analysis/{data,mc}/prefix_dir_res` in the database file + +Adjust `DIR_PATH` in the script. It is the path to the base directory where you store directories with MLHEP results. + +This script needs to be ran only once to generate databases. + +Currently, the trials are hardcoded in the Python script. To add or modify a trial, you need to adjust `BASE_TRIALS` variable and the `process_trial` function. + +## Get mass fits for each trial + +File: `run-mlhep-fitter-multitrial.sh`
+Usage: `./run-mlhep-fitter-multitrial.sh` + +The `submission/analyzer.yml` config is used. +The script automates running MLHEP for each trial. Mass histograms are copied before each MLHEP invocation, so as only the quick fit steps needs to be activated in `submission/analyzer.yml` + +Adjust the variables before the `for` loop.
+The script includes also a call to `run-mlhep-fitter-multitrial.py`, which can be commented out. In this case, make sure to pass the same `OUT_DB_DIR`, `DB_PATTERN`, `DIR_PATTERN` values to the two scripts. + +Before running, you need to create the directory structure for each MLHEP output. You can, for example, run the `.sh` script with the `cp` lines commented out. Then, MLHEP creates directories for each trial and fails quietly. Next, run the script with `cp` lines uncommented, and you will get the final output. + +## Plot multitrial results + +Files: `multitrial.py`, `config_multitrial.json`
+Usage: `python3 multitrial.py config_multitrial.json` + +Adjust the sample `config_multitrial.json` to your needs. diff --git a/machine_learning_hep/multitrial/config_multitrial.json b/machine_learning_hep/multitrial/config_multitrial.json new file mode 100644 index 0000000000..35aa25a32e --- /dev/null +++ b/machine_learning_hep/multitrial/config_multitrial.json @@ -0,0 +1,20 @@ +{ + "file_pattern": "/data8/majak/MLHEP/results-24022025-newtrain-multitrial-prompt*/LHC23pp_pass4/Results/resultsdatatot/yields_LcpKpi_Run3analysis.root", + "_file_pattern": "regex pattern for all multitrial fit files; note the asterisk to match all trial suffixes", + "dir_pattern": "results-24022025-newtrain-multitrial-prompt", + "_dir_pattern": "the base directory prefix from the file pattern above", + "histoname": "hyields0", + "_histoname": "histogram with mass fit", + "sel_histoname": "hchi0", + "_sel_histoname": "histogram for filtering the results", + "selection": "lambda x : x < 5.0", + "_selection": "filter to apply with sel_histoname, e.g., chi < 5", + "pt_bins_min": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16], + "pt_bins_max": [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24], + "central_trial": "", + "_central_trial": "suffix of the directory with the central trial", + "outdir": "/data8/majak/multitrial", + "_outdir": "output directory", + "outfile": "result-prompt-chi5", + "_outfile": "output file pattern" +} diff --git a/machine_learning_hep/multitrial/multitrial.py b/machine_learning_hep/multitrial/multitrial.py new file mode 100644 index 0000000000..593a99dd31 --- /dev/null +++ b/machine_learning_hep/multitrial/multitrial.py @@ -0,0 +1,187 @@ +# pylint: disable=missing-function-docstring, invalid-name +""" +file: multitrial.py +brief: Plot multitrial systematics based on multiple fit trials, one file per trial. +usage: python3 multitrial.py config_multitrial.json +author: Maja Karwowska , Warsaw University of Technology +""" +import argparse +import glob +import json +import re +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import MultipleLocator, AutoMinorLocator + +from ROOT import ( # pylint: disable=import-error,no-name-in-module + TFile, + gROOT, +) + + +def plot_text_box(ax, text): + ax.text(0.98, 0.97, text, + horizontalalignment="right", verticalalignment="top", + fontsize=40, va="top", transform=ax.transAxes, + bbox={"edgecolor": "black", "fill": False}) + + +def get_yields(cfg): + filenames = sorted(glob.glob(cfg["file_pattern"]), + key=lambda filename: re.split("/", filename)[-2]) + yields = {} + yields_err = {} + trials = {} + chis = {} + for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]): + yields[f"{pt_bin_min}_{pt_bin_max}"] = [] + yields_err[f"{pt_bin_min}_{pt_bin_max}"] = [] + trials[f"{pt_bin_min}_{pt_bin_max}"] = [] + chis[f"{pt_bin_min}_{pt_bin_max}"] = [] + for filename in filenames: + print(f"Reading {filename}") + with TFile.Open(filename) as fin: + hist = fin.Get(cfg["histoname"]) + hist_sel = fin.Get(cfg["sel_histoname"]) + if hist.ClassName() != "TH1F": + print(f"No hist in {filename}") + if hist_sel.ClassName() != "TH1F": + print(f"No hist sel in {filename}") + dirname = re.split("/", filename)[4] # [-2] for D2H fitter + trial_name = dirname.replace(cfg["dir_pattern"], "") + for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"], + cfg["pt_bins_max"])): + if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ + and hist.GetBinContent(ind + 1) > 1.0 : # pylint: disable=eval-used + yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1)) + yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1)) + trials[f"{pt_bin_min}_{pt_bin_max}"].append(trial_name) + chis[f"{pt_bin_min}_{pt_bin_max}"].append(hist_sel.GetBinContent(ind + 1)) + else: + print(f"Rejected: {hist_sel.GetBinContent(ind + 1)} {trial_name} "\ + f"pt: {pt_bin_min}, {pt_bin_max}") + if hist.GetBinContent(ind + 1) < 1.0: + print("Yield 0") + return yields, yields_err, trials, chis + + +def prepare_figure(y_label, ticks): + fig = plt.figure(figsize=(20, 15)) + ax = plt.subplot(1, 1, 1) + ax.set_xlabel("Trial #", fontsize=20) + ax.set_ylabel(y_label, fontsize=20) + ax.tick_params(which="both", width=2.5, direction="in") + ax.tick_params(which="major", labelsize=20, length=15) + ax.tick_params(which="minor", length=7) + ax.xaxis.set_major_locator(MultipleLocator(ticks)) + ax.xaxis.set_minor_locator(AutoMinorLocator(5)) + ax.yaxis.set_minor_locator(AutoMinorLocator(5)) + return fig, ax + + +def set_ax_limits(ax, pt_string, values): + ax.margins(0.01, 0.2) + np_values = np.array(values, dtype="float32") + if ax.get_ylim()[1] - ax.get_ylim()[0] > 30.0 * np.std(np_values): + ax.set_ylim(np.mean(np_values) - 10.0 * np.std(np_values), + np.mean(np_values) + 10.0 * np.std(np_values)) + print(f"{pt_string} narrowing down the axis to {ax.get_ylim()}") + + +def plot_trial_line(ax, central_trial_ind): + axis_lim = ax.get_ylim() + y_axis = np.linspace(*axis_lim, 100) + ax.plot([central_trial_ind] * len(y_axis), y_axis, c="m", ls="--", linewidth=4.0) + ax.set_ylim(*axis_lim) + + +def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string, + central_trial_ind, central_yield): + fig, ax = prepare_figure("Raw yield", 100) + x_axis = range(len(trials)) + ax.errorbar(x_axis, yields, yerr=yields_err, + fmt="o", c="b", elinewidth=2.5, linewidth=4.0) + set_ax_limits(ax, pt_string, yields) + central_line = np.array([central_yield] * len(x_axis), dtype="float32") + ax.plot(x_axis, central_line, c="orange", ls="--", linewidth=4.0) + central_err = np.array([yields_err[central_trial_ind]] * len(x_axis), dtype="float32") + ax.fill_between(x_axis, central_line - central_err, central_line + central_err, + facecolor="orange", edgecolor="none", alpha=0.3) + plot_trial_line(ax, central_trial_ind) + plot_text_box(ax, plot_pt_string) + fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_yields_trials_{pt_string}.png', + bbox_inches='tight') + plt.close() + + +def plot_chis(chis, cfg, pt_string, plot_pt_string): + fig, ax = prepare_figure("Chi2/ndf", 100) + x_axis = range(len(chis)) + ax.scatter(x_axis, chis, c="b", marker="o") + set_ax_limits(ax, pt_string, chis) + plot_text_box(ax, plot_pt_string) + fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_chis_{pt_string}.png', + bbox_inches='tight') + plt.close() + + +def plot_yields_distr(yields, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield): + plt.figure(figsize=(20, 15)) + ax = plt.subplot(1, 1, 1) + ax.set_xlabel("Ratio", fontsize=20) + ax.tick_params(labelsize=20, length=7, width=2.5) + ratios = [yield_ / central_yield for ind, yield_ in enumerate(yields) \ + if ind != central_trial_ind] + ax.hist(ratios, color="b", linewidth=4.0) + mean = np.mean(yields) + std_dev = np.std(yields) + diffs = [(yield_ - central_yield) / central_yield \ + for yield_ in yields[:central_trial_ind]] + diffs.extend([(yield_ - central_yield) / central_yield \ + for yield_ in yields[central_trial_ind+1:]]) + rmse = np.sqrt(np.mean(np.array(diffs, dtype="float32")**2)) + plot_text_box(ax, f"{plot_pt_string}\n"\ + f"mean: {mean:.0f}\n"\ + f"std dev: {std_dev:.2f}\n"\ + f"RMSE: {rmse:.2f}\n"\ + f"#trials: {len(yields)}") + plt.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_distr_{pt_string}.png', bbox_inches='tight') + plt.close() + + +def main(): + gROOT.SetBatch(True) + + parser = argparse.ArgumentParser(description="Arguments to pass") + parser.add_argument("config", help="JSON config file") + args = parser.parse_args() + + with open(args.config, encoding="utf8") as fil: + cfg = json.load(fil) + + yields, yields_err, trials, chis = get_yields(cfg) + + for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]): + plot_pt_string = f"${pt_bin_min} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {pt_bin_max}$" + pt_string = f"{pt_bin_min}_{pt_bin_max}" + + try: + central_trial_ind = trials[pt_string].index(cfg["central_trial"]) + central_yield = yields[pt_string][central_trial_ind] + + plot_yields_trials(yields[pt_string], yields_err[pt_string], trials[pt_string], cfg, + pt_string, plot_pt_string, central_trial_ind, central_yield) + plot_yields_distr(yields[pt_string], cfg, pt_string, plot_pt_string, + central_trial_ind, central_yield) + plot_chis(chis[pt_string], cfg, pt_string, plot_pt_string) + except: # pylint: disable=bare-except + pass + + with open(f'{cfg["outdir"]}/{cfg["outfile"]}_trials_{pt_string}.txt', + "w", encoding="utf-8") as ftext: + for trial in trials[pt_string]: + ftext.write(f"{trial}\n") + + +if __name__ == "__main__": + main() diff --git a/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py new file mode 100644 index 0000000000..d8307def4e --- /dev/null +++ b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py @@ -0,0 +1,160 @@ +# pylint: disable=missing-function-docstring, invalid-name +""" +file: run-mlhep-fitter-multitrial.py +brief: Prepare MLHEP database files for different fit configurations for multitrial systematics. +usage: python run-mlhep-fitter-multitrial.py database_lc data/data_run3 trial_configs_dir mlhep_results_dir_pattern +author: Maja Karwowska , Warsaw University of Technology +""" + +import argparse +import re +import shutil +import yaml + +SIGMA02="0.007, 0.007, 0.013" +SIGMA23="0.007, 0.007, 0.013" +SIGMA34="0.007, 0.007, 0.012" +SIGMA45="0.008, 0.008, 0.016" +SIGMA56="0.010, 0.010, 0.016" +SIGMA67="0.008, 0.008, 0.017" +SIGMA78="0.012, 0.012, 0.018" +SIGMA810="0.015, 0.012, 0.018" +SIGMA1012="0.010, 0.010, 0.022" +SIGMA1216="0.016, 0.016, 0.029" +SIGMA1624="0.016, 0.016, 0.029" +FREE_SIGMAS=[SIGMA02, SIGMA23, SIGMA34, SIGMA45, SIGMA56, SIGMA67, SIGMA78, + SIGMA810, SIGMA1012, SIGMA1216, SIGMA1624] + +CENTRAL_TRIAL="" + +BASE_TRIALS = ( + ["alpha-15%", "alpha+15%"], + ["n-15%", "n+15%"], + ["rebin-1", "rebin+1"], + ["free-sigma"], + ["poly3"], + ["narrow", "narrow2", "wide", "wide2"] +) + +DIR_PATH = "/data8/majak/MLHEP" + +def generate_trials(trial_classes): + combinations = [""] + for trial_class in trial_classes: + class_comb = [] + for cur_comb in combinations: + for trial in trial_class: + class_comb.append(cur_comb + "_" + trial) + #print(f"{cur_comb}_{trial}") + combinations.extend(class_comb) + return combinations + +def replace_with_reval(var, in_str, frac): + pattern = fr"{var}\[([0-9.]*), .*?\]" + values = re.findall(pattern, in_str) + new_val = round(float(values[0]) * frac, 3) + return re.sub(pattern, f"{var}[{new_val}, {new_val}]", in_str) + +def process_trial(trial, ana_cfg, data_cfg, mc_cfg): + fit_cfg = ana_cfg["mass_roofit"] + if "alpha-15%" in trial: + print("Processing alpha-15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("alpha1", sig_fn, 0.85) + elif "alpha+15%" in trial: + print("Processing alpha+15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("alpha1", sig_fn, 1.15) + elif "n-15%" in trial: + print("Processing n-15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("n1", sig_fn, 0.85) + elif "n+15%" in trial: + print("Processing n+15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("n1", sig_fn, 1.15) + elif "rebin-1" in trial: + print("Processing rebin-1") + ana_cfg["n_rebin"] = [rebin - 1 for rebin in ana_cfg["n_rebin"]] + elif "rebin+1" in trial: + print("Processing rebin+1") + ana_cfg["n_rebin"] = [rebin + 1 for rebin in ana_cfg["n_rebin"]] + elif "free-sigma" in trial: + print("Processing free-sigma") + for pt_cfg, free_sigma in zip(mc_cfg, FREE_SIGMAS): + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = re.sub(r"sigma_g1\[(.*?)\]", + f"sigma_g1[{free_sigma}]", sig_fn) + elif "poly3" in trial: + print("Processing poly3") + for pt_cfg in data_cfg: + bkg_fn = pt_cfg["components"]["bkg"]["fn"] + pt_cfg["components"]["bkg"]["fn"] = re.sub(r"a2\[(.*?)\]", + r"a2[\1], a3[-1e8, 1e8]", bkg_fn) + elif "narrow2" in trial: + print("Processing narrow2") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [pt_cfg["range"][0] + 0.02, pt_cfg["range"][1] - 0.02] + elif "narrow" in trial: + print("Processing narrow") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [pt_cfg["range"][0] + 0.01, pt_cfg["range"][1] - 0.01] + elif "wide2" in trial: + print("Processing wide2") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [max(2.10, pt_cfg["range"][0] - 0.02), + min(2.47, pt_cfg["range"][1] + 0.02)] + elif "wide" in trial: + print("Processing wide") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [max(2.10, pt_cfg["range"][0] - 0.01), + min(2.47, pt_cfg["range"][1] + 0.01)] + + +def main(db, db_dir, out_db_dir, resdir_pattern): + db_ext=f"{db}.yml" + db_path=f"{db_dir}/{db_ext}" + combinations = generate_trials(BASE_TRIALS) + + for comb in combinations: + print(comb) + + cur_cfg = f"{out_db_dir}/{db}{comb}.yml" + shutil.copy2(db_path, cur_cfg) + + with open(cur_cfg, encoding="utf-8") as stream: + cfg = yaml.safe_load(stream) + + ana_cfg = cfg["LcpKpi"]["analysis"]["Run3analysis"] + fit_cfg = ana_cfg["mass_roofit"] + mc_cfg = [fit_params for fit_params in fit_cfg \ + if "level" in fit_params and fit_params["level"] == "mc"] + data_cfg = [fit_params for fit_params in fit_cfg if not "level" in fit_params] + + resdir = f"{resdir_pattern}{comb}" + respath = f"{DIR_PATH}/{resdir}/" + ana_cfg["data"]["prefix_dir_res"] = respath + ana_cfg["mc"]["prefix_dir_res"] = respath + + trials = comb.split("_") + + for trial in trials: + process_trial(trial, ana_cfg, data_cfg, mc_cfg) + + with open(cur_cfg, "w", encoding="utf-8") as stream: + yaml.dump(cfg, stream, sort_keys=False, width=10000, default_flow_style=None) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Arguments to pass") + parser.add_argument("db", help="MLHEP database without extension") + parser.add_argument("db_dir", help="path to directory with MLHEP database") + parser.add_argument("out_db_dir", help="path to output directory for generated MLHEP databases") + parser.add_argument("resdir", help="MLHEP resdir pattern") + args = parser.parse_args() + + main(args.db, args.db_dir, args.out_db_dir, args.resdir) diff --git a/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh new file mode 100644 index 0000000000..4006d0790a --- /dev/null +++ b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +DB_PATTERN="database_ml_parameters_LcToPKPi_multiclass_fdd" # Original database to be used as template +DB_DIR="data/data_run3" +OUT_DB_DIR="multitrial-db" # Directory to store multitrial databases only +ext=".yml" + +DIR_PATH="/data8/majak/MLHEP" +DIR_PATTERN="results-24022025-newtrain-multitrial-prompt" # Prefix of output directory for fit results + +# Paths to input masshistos to fit +BASE_DIR="/data8/majak/MLHEP/results-24022025-newtrain-ptshape-prompt" +DATA_HIST="LHC23pp/Results/resultsdatatot/masshisto.root" +MC_HIST="LHC24pp_mc/Results/resultsmctot/masshisto.root" + +# Run this only once to generate databases +# Then, you can comment this out if you don't change the *.py file +# The output analysis dir is set in databases to DIR_PATTERN + suffix with trial name +python run-mlhep-fitter-multitrial.py "${DB_PATTERN}" "${DB_DIR}" "${OUT_DB_DIR}" "${DIR_PATTERN}" || exit 1 + +for db in "${OUT_DB_DIR}"/*.yml ; do + db_basename=$(basename "${db}") + db_basename_no_ext=${db_basename%%"${ext}"} + echo "${db_basename_no_ext}" + suffix=${db_basename_no_ext##"${DB_PATTERN}"} + echo "suffix: ${suffix}" + RESPATH="${DIR_PATH}/${DIR_PATTERN}${suffix}" + echo "respath: ${RESPATH}" + + # Copy base masshistos so as to skip the masshisto step + # Only the fit step needs to be activated in analyzer.yml + # You need first to create the directory trees + cp "${BASE_DIR}/${DATA_HIST}" "${RESPATH}/${DATA_HIST}" + cp "${BASE_DIR}/${MC_HIST}" "${RESPATH}/${MC_HIST}" + + mlhep "logfile_${db_basename}.log" \ + -a Run3analysis \ + --run-config submission/analyzer.yml \ + --database-analysis "${db}" +done +