From ff28a60bf1bdc0f3cde5cde919f20c59ad1e8936 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 15 Jul 2025 11:59:58 +0200 Subject: [PATCH 1/5] Add multitrial scripts --- machine_learning_hep/multitrial.py | 188 ++++++++++++++++++ .../run-mlhep-fitter-multitrial.py | 158 +++++++++++++++ .../run-mlhep-fitter-multitrial.sh | 47 +++++ 3 files changed, 393 insertions(+) create mode 100644 machine_learning_hep/multitrial.py create mode 100644 machine_learning_hep/run-mlhep-fitter-multitrial.py create mode 100644 machine_learning_hep/run-mlhep-fitter-multitrial.sh diff --git a/machine_learning_hep/multitrial.py b/machine_learning_hep/multitrial.py new file mode 100644 index 0000000000..38119c95fa --- /dev/null +++ b/machine_learning_hep/multitrial.py @@ -0,0 +1,188 @@ +# pylint: disable=missing-function-docstring, invalid-name +""" +file: multitrial.py +brief: Plot multitrial systematics based on multiple fit trials, one file per trial. +usage: python3 multitrial.py config_multitrial.json +author: Maja Karwowska , Warsaw University of Technology +""" +import argparse +import glob +import json +import re +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.ticker import MultipleLocator, AutoMinorLocator + +from ROOT import ( # pylint: disable=import-error,no-name-in-module + TFile, + gROOT, +) + + +def plot_text_box(ax, text): + ax.text(0.98, 0.97, text, + horizontalalignment="right", verticalalignment="top", + fontsize=40, va="top", transform=ax.transAxes, + bbox={"edgecolor": "black", "fill": False}) + + +def get_yields(cfg): + filenames = sorted(glob.glob(cfg["file_pattern"]), + key=lambda filename: re.split("/", filename)[-2]) + yields = {} + yields_err = {} + trials = {} + chis = {} + for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]): + yields[f"{pt_bin_min}_{pt_bin_max}"] = [] + yields_err[f"{pt_bin_min}_{pt_bin_max}"] = [] + trials[f"{pt_bin_min}_{pt_bin_max}"] = [] + chis[f"{pt_bin_min}_{pt_bin_max}"] = [] + for filename in filenames: + print(f"Reading {filename}") + with TFile.Open(filename) as fin: + hist = fin.Get(cfg["histoname"]) + hist_sel = fin.Get(cfg["sel_histoname"]) + if hist.ClassName() != "TH1F": + print(f"No hist in {filename}") + if hist_sel.ClassName() != "TH1F": + print(f"No hist sel in {filename}") + dirname = re.split("/", filename)[4] # [-2] for D2H fitter + trial_name = dirname.replace(cfg["dir_pattern"], "") + for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"], + cfg["pt_bins_max"])): + if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ + and hist.GetBinContent(ind + 1) > 1.0 : + yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1)) + yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1)) + trials[f"{pt_bin_min}_{pt_bin_max}"].append(trial_name) + chis[f"{pt_bin_min}_{pt_bin_max}"].append(hist_sel.GetBinContent(ind + 1)) + else: + print(f"Rejected: {hist_sel.GetBinContent(ind + 1)} {trial_name} "\ + f"pt: {pt_bin_min}, {pt_bin_max}") + if hist.GetBinContent(ind + 1) < 1.0: + print("Yield 0") + return yields, yields_err, trials, chis + + +def prepare_figure(cfg, y_label, ticks): + fig = plt.figure(figsize=(20, 15)) + ax = plt.subplot(1, 1, 1) + ax.set_xlabel(cfg["x_axis"], fontsize=20) + ax.set_ylabel(y_label, fontsize=20) + ax.tick_params(which="both", width=2.5, direction="in") + ax.tick_params(which="major", labelsize=20, length=15) + ax.tick_params(which="minor", length=7) + ax.xaxis.set_major_locator(MultipleLocator(ticks)) + ax.xaxis.set_minor_locator(AutoMinorLocator(5)) + ax.yaxis.set_minor_locator(AutoMinorLocator(5)) + return fig, ax + + +def set_ax_limits(ax, pt_string, values, errs): + ax.margins(0.01, 0.2) + np_values = np.array(values, dtype="float32") + np_errs = np.array(errs, dtype="float32") + if ax.get_ylim()[1] - ax.get_ylim()[0] > 30.0 * np.std(np_values): + ax.set_ylim(np.mean(np_values) - 10.0 * np.std(np_values), + np.mean(np_values) + 10.0 * np.std(np_values)) + print(f"{pt_string} narrowing down the axis to {ax.get_ylim()}") + + +def plot_trial_line(ax, central_trial_ind): + axis_lim = ax.get_ylim() + y_axis = np.linspace(*axis_lim, 100) + ax.plot([central_trial_ind] * len(y_axis), y_axis, c="m", ls="--", linewidth=4.0) + ax.set_ylim(*axis_lim) + + +def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string, + central_trial_ind, central_yield): + fig, ax = prepare_figure(cfg, cfg["y_axis"], 100) + x_axis = range(len(trials)) + ax.errorbar(x_axis, yields, yerr=yields_err, + fmt="o", c="b", elinewidth=2.5, linewidth=4.0) + set_ax_limits(ax, pt_string, yields, yields_err) + central_line = np.array([central_yield] * len(x_axis), dtype="float32") + ax.plot(x_axis, central_line, c="orange", ls="--", linewidth=4.0) + central_err = np.array([yields_err[central_trial_ind]] * len(x_axis), dtype="float32") + ax.fill_between(x_axis, central_line - central_err, central_line + central_err, + facecolor="orange", edgecolor="none", alpha=0.3) + plot_trial_line(ax, central_trial_ind) + plot_text_box(ax, plot_pt_string) + fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_yields_trials_{pt_string}.png', + bbox_inches='tight') + plt.close() + + +def plot_chis(chis, cfg, pt_string, plot_pt_string): + fig, ax = prepare_figure(cfg, "Chi2/ndf", 100) + x_axis = range(len(chis)) + ax.scatter(x_axis, chis, c="b", marker="o") + set_ax_limits(ax, pt_string, chis, [0.0] * len(chis)) + plot_text_box(ax, plot_pt_string) + fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_chis_{pt_string}.png', + bbox_inches='tight') + plt.close() + + +def plot_yields_distr(yields, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield): + plt.figure(figsize=(20, 15)) + ax = plt.subplot(1, 1, 1) + ax.set_xlabel("Ratio", fontsize=20) + ax.tick_params(labelsize=20, length=7, width=2.5) + ratios = [yield_ / central_yield for ind, yield_ in enumerate(yields) \ + if ind != central_trial_ind] + ax.hist(ratios, color="b", linewidth=4.0) + mean = np.mean(yields) + std_dev = np.std(yields) + diffs = [(yield_ - central_yield) / central_yield \ + for yield_ in yields[:central_trial_ind]] + diffs.extend([(yield_ - central_yield) / central_yield \ + for yield_ in yields[central_trial_ind+1:]]) + rmse = np.sqrt(np.mean(np.array(diffs, dtype="float32")**2)) + plot_text_box(ax, f"{plot_pt_string}\n"\ + f"mean: {mean:.0f}\n"\ + f"std dev: {std_dev:.2f}\n"\ + f"RMSE: {rmse:.2f}\n"\ + f"#trials: {len(yields)}") + plt.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_distr_{pt_string}.png', bbox_inches='tight') + plt.close() + + +def main(): + gROOT.SetBatch(True) + + parser = argparse.ArgumentParser(description="Arguments to pass") + parser.add_argument("config", help="JSON config file") + args = parser.parse_args() + + with open(args.config, encoding="utf8") as fil: + cfg = json.load(fil) + + yields, yields_err, trials, chis = get_yields(cfg) + + for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]): + plot_pt_string = f"${pt_bin_min} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {pt_bin_max}$" + pt_string = f"{pt_bin_min}_{pt_bin_max}" + + try: + central_trial_ind = trials[pt_string].index(cfg["central_trial"]) + central_yield = yields[pt_string][central_trial_ind] + + plot_yields_trials(yields[pt_string], yields_err[pt_string], trials[pt_string], cfg, + pt_string, plot_pt_string, central_trial_ind, central_yield) + plot_yields_distr(yields[pt_string], cfg, pt_string, plot_pt_string, + central_trial_ind, central_yield) + plot_chis(chis[pt_string], cfg, pt_string, plot_pt_string) + except: + pass + + with open(f'{cfg["outdir"]}/{cfg["outfile"]}_trials_{pt_string}.txt', + "w", encoding="utf-8") as ftext: + for trial in trials[pt_string]: + ftext.write(f"{trial}\n") + + +if __name__ == "__main__": + main() diff --git a/machine_learning_hep/run-mlhep-fitter-multitrial.py b/machine_learning_hep/run-mlhep-fitter-multitrial.py new file mode 100644 index 0000000000..c5a06a1836 --- /dev/null +++ b/machine_learning_hep/run-mlhep-fitter-multitrial.py @@ -0,0 +1,158 @@ +# pylint: disable=missing-function-docstring, invalid-name +""" +file: run-mlhep-fitter-multitrial.py +brief: Prepare MLHEP database files for different fit configurations for multitrial systematics. +usage: python3 run-mlhep-fitter-multitrial.py +author: Maja Karwowska , Warsaw University of Technology +""" + +import argparse +import re +import shutil +import yaml + +SIGMA02="0.007, 0.007, 0.013" +SIGMA23="0.007, 0.007, 0.013" +SIGMA34="0.007, 0.007, 0.012" +SIGMA45="0.008, 0.008, 0.016" +SIGMA56="0.010, 0.010, 0.016" +SIGMA67="0.008, 0.008, 0.017" +SIGMA78="0.012, 0.012, 0.018" +SIGMA810="0.015, 0.012, 0.018" +SIGMA1012="0.010, 0.010, 0.022" +SIGMA1216="0.016, 0.016, 0.029" +SIGMA1624="0.016, 0.016, 0.029" +FREE_SIGMAS=[SIGMA02, SIGMA23, SIGMA34, SIGMA45, SIGMA56, SIGMA67, SIGMA78, + SIGMA810, SIGMA1012, SIGMA1216, SIGMA1624] + +CENTRAL_TRIAL="" + +BASE_TRIALS = ( + ["alpha-15%", "alpha+15%"], + ["n-15%", "n+15%"], + ["rebin-1", "rebin+1"], + ["free-sigma"], + ["poly3"], + ["narrow", "narrow2", "wide", "wide2"] +) + +def generate_trials(trial_classes): + combinations = [""] + for trial_class in trial_classes: + class_comb = [] + for cur_comb in combinations: + for trial in trial_class: + class_comb.append(cur_comb + "_" + trial) + #print(f"{cur_comb}_{trial}") + combinations.extend(class_comb) + return combinations + +def replace_with_reval(var, in_str, frac): + pattern = fr"{var}\[([0-9.]*), .*?\]" + values = re.findall(pattern, in_str) + new_val = round(float(values[0]) * frac, 3) + return re.sub(pattern, f"{var}[{new_val}, {new_val}]", in_str) + +def process_trial(trial, ana_cfg, data_cfg, mc_cfg): + fit_cfg = ana_cfg["mass_roofit"] + if "alpha-15%" in trial: + print("Processing alpha-15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("alpha1", sig_fn, 0.85) + elif "alpha+15%" in trial: + print("Processing alpha+15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("alpha1", sig_fn, 1.15) + elif "n-15%" in trial: + print("Processing n-15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("n1", sig_fn, 0.85) + elif "n+15%" in trial: + print("Processing n+15%") + for pt_cfg in mc_cfg: + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = replace_with_reval("n1", sig_fn, 1.15) + elif "rebin-1" in trial: + print("Processing rebin-1") + ana_cfg["n_rebin"] = [rebin - 1 for rebin in ana_cfg["n_rebin"]] + elif "rebin+1" in trial: + print("Processing rebin+1") + ana_cfg["n_rebin"] = [rebin + 1 for rebin in ana_cfg["n_rebin"]] + elif "free-sigma" in trial: + print("Processing free-sigma") + for pt_cfg, free_sigma in zip(mc_cfg, FREE_SIGMAS): + sig_fn = pt_cfg["components"]["sig"]["fn"] + pt_cfg["components"]["sig"]["fn"] = re.sub(r"sigma_g1\[(.*?)\]", + f"sigma_g1[{free_sigma}]", sig_fn) + elif "poly3" in trial: + print("Processing poly3") + for pt_cfg in data_cfg: + bkg_fn = pt_cfg["components"]["bkg"]["fn"] + pt_cfg["components"]["bkg"]["fn"] = re.sub(r"a2\[(.*?)\]", + r"a2[\1], a3[-1e8, 1e8]", bkg_fn) + elif "narrow2" in trial: + print("Processing narrow2") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [pt_cfg["range"][0] + 0.02, pt_cfg["range"][1] - 0.02] + elif "narrow" in trial: + print("Processing narrow") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [pt_cfg["range"][0] + 0.01, pt_cfg["range"][1] - 0.01] + elif "wide2" in trial: + print("Processing wide2") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [max(2.10, pt_cfg["range"][0] - 0.02), + min(2.47, pt_cfg["range"][1] + 0.02)] + elif "wide" in trial: + print("Processing wide") + for pt_cfg in fit_cfg: + pt_cfg["range"] = [max(2.10, pt_cfg["range"][0] - 0.01), + min(2.47, pt_cfg["range"][1] + 0.01)] + + +def main(db, db_dir, out_db_dir, resdir_pattern): + db_ext=f"{db}.yml" + db_path=f"{db_dir}/{db_ext}" + combinations = generate_trials(BASE_TRIALS) + + for comb in combinations: + print(comb) + + cur_cfg = f"{out_db_dir}/{db}{comb}.yml" + shutil.copy2(db_path, cur_cfg) + + with open(cur_cfg, encoding="utf-8") as stream: + cfg = yaml.safe_load(stream) + + ana_cfg = cfg["LcpKpi"]["analysis"]["Run3analysis"] + fit_cfg = ana_cfg["mass_roofit"] + mc_cfg = [fit_params for fit_params in fit_cfg \ + if "level" in fit_params and fit_params["level"] == "mc"] + data_cfg = [fit_params for fit_params in fit_cfg if not "level" in fit_params] + + resdir = f"{resdir_pattern}{comb}" + respath = f"/data8/majak/MLHEP/{resdir}/" + ana_cfg["data"]["prefix_dir_res"] = respath + ana_cfg["mc"]["prefix_dir_res"] = respath + + trials = comb.split("_") + + for trial in trials: + process_trial(trial, ana_cfg, data_cfg, mc_cfg) + + with open(cur_cfg, "w", encoding="utf-8") as stream: + yaml.dump(cfg, stream, sort_keys=False, width=10000, default_flow_style=None) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Arguments to pass") + parser.add_argument("db", help="MLHEP database without extension") + parser.add_argument("db_dir", help="path to directory with MLHEP database") + parser.add_argument("out_db_dir", help="path to output directory for generated MLHEP databases") + parser.add_argument("resdir", help="MLHEP resdir pattern") + args = parser.parse_args() + + main(args.db, args.db_dir, args.out_db_dir, args.resdir) diff --git a/machine_learning_hep/run-mlhep-fitter-multitrial.sh b/machine_learning_hep/run-mlhep-fitter-multitrial.sh new file mode 100644 index 0000000000..26f25972af --- /dev/null +++ b/machine_learning_hep/run-mlhep-fitter-multitrial.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +DB_PATTERN="database_ml_parameters_LcToPKPi_multiclass_fdd" # Original database to be used as template +DB_DIR="data/data_run3" +OUT_DB_DIR="multitrial-db" # Directory to store multitrial databases only +ext=".yml" + +DIR_PATH="/data8/majak/MLHEP" +DIR_PATTERN="results-24022025-newtrain-multitrial-prompt" # Prefix of output directory for fit results + +# Paths to masshistos to fit +BASE_DIR="/data8/majak/MLHEP/results-24022025-newtrain-ptshape-prompt" +DATA_HIST="LHC23pp/Results/resultsdatatot/masshisto.root" +MC_HIST="LHC24pp_mc/Results/resultsmctot/masshisto.root" + +# Run this only once to generate databases +# Then, you can comment this out if you don't change the *.py file +# The output analysis dir is set in databases to DIR_PATTERN + suffix with trial name +python run-mlhep-fitter-multitrial.py "${DB_PATTERN}" "${DB_DIR}" "${OUT_DB_DIR}" "${DIR_PATTERN}" || exit 1 + +for db in ${OUT_DB_DIR}/*.yml ; do + db_basename=`basename ${db}` + db_basename_no_ext=${db_basename%%${ext}} + echo ${db_basename_no_ext} + suffix=${db_basename_no_ext##${DB_PATTERN}} + echo "suffix: ${suffix}" + RESPATH="${DIR_PATH}/${DIR_PATTERN}${suffix}" + echo "respath: ${RESPATH}" + + # Copy base masshistos so as to skip the masshisto step + # Only the fit step needs to be activated in analyzer.yml + # You need first to create the directory trees + cp "${BASE_DIR}/${DATA_HIST}" "${RESPATH}/${DATA_HIST}" + cp "${BASE_DIR}/${MC_HIST}" "${RESPATH}/${MC_HIST}" + + mlhep logfile_${db_basename}.log \ + -a Run3analysis \ + --run-config submission/analyzer.yml \ + --database-analysis ${db} + + # Copy the plots from MachineLearningHEP/machine_learning_hep/fig/ to RESPATH + # It's not compulsory, it's just for you to see the fits + # It might be obsolete if you changed the default output fig/ location in MLHEP + rm -rf ${RESPATH}/fig/ + mv fig/ ${RESPATH}/fig/ +done + From 7c0d2c2286c86da85cc02ea98ea20225f66c6236 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 15 Jul 2025 12:45:48 +0200 Subject: [PATCH 2/5] Add multitrial scripts and README --- machine_learning_hep/multitrial/README.md | 36 +++++++++++++++++++ .../{ => multitrial}/multitrial.py | 4 +-- .../run-mlhep-fitter-multitrial.py | 6 ++-- .../run-mlhep-fitter-multitrial.sh | 8 +---- 4 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 machine_learning_hep/multitrial/README.md rename machine_learning_hep/{ => multitrial}/multitrial.py (98%) rename machine_learning_hep/{ => multitrial}/run-mlhep-fitter-multitrial.py (96%) rename machine_learning_hep/{ => multitrial}/run-mlhep-fitter-multitrial.sh (83%) diff --git a/machine_learning_hep/multitrial/README.md b/machine_learning_hep/multitrial/README.md new file mode 100644 index 0000000000..93054c751c --- /dev/null +++ b/machine_learning_hep/multitrial/README.md @@ -0,0 +1,36 @@ +# Multitrial systematics with MLHEP + +## Generate configurations (MLHEP yml databases) for each trial + +File: `run-mlhep-fitter-multitrial.py`
+Usage: `python run-mlhep-fitter-multitrial.py database_file in_db_dir out_db_dir mlhep_results_dir_pattern` + +Arguments: +- `database_file`: filename of the template database without the .yml extension, e.g., `database_ml_parameters_LcToPKPi` +- `in_db_dir`: path to the directory containing the database, e.g., `data/data_run3` +- `out_db_dir`: path to the directory for output multitrial databases, e.g., `multitrial_db` +- `mlhep_results_dir_pattern`: prefix of output directory name for fit results; for each trial, the trial name is appended to the directory name, and the resulting directory name is written under `Run3analysis/{data,mc}/prefix_dir_res` in the database file + +Adjust `DIR_PATH` in the script. It is the path to the base directory where you store directories with MLHEP results. + +This script needs to be ran only once to generate databases. + +Currently, the trials are hardcoded in the Python script. To add or modify a trial, you need to adjust `BASE_TRIALS` variable and the `process_trial` function. + +## Get mass fits for each trial + +File: `run-mlhep-fitter-multitrial.sh`
+Usage: `./run-mlhep-fitter-multitrial.sh` + +The `submission/analyzer.yml` config is used. +The script automates running MLHEP for each trial. Mass histograms are copied before each MLHEP invocation, so as only the quick fit steps needs to be activated in `submission/analyzer.yml` + +Adjust the variables before the `for` loop.
+The script includes also a call to `run-mlhep-fitter-multitrial.py`, which can be commented out. In this case, make sure to pass the same `OUT_DB_DIR`, `DB_PATTERN`, `DIR_PATTERN` values to the two scripts. + +Before running, you need to create the directory structure for each MLHEP output. You can, for example, run the `.sh` script with the `cp` lines commented out. Then, MLHEP creates directories for each trial and fails quietly. Next, run the script with `cp` lines uncommented, and you will get the final output. + +## Plot multitrial results + +Files: `multitrial.py`, `config_multitrial.json`
+Usage: `python3 multitrial.py config_multitrial.json` diff --git a/machine_learning_hep/multitrial.py b/machine_learning_hep/multitrial/multitrial.py similarity index 98% rename from machine_learning_hep/multitrial.py rename to machine_learning_hep/multitrial/multitrial.py index 38119c95fa..71194ce979 100644 --- a/machine_learning_hep/multitrial.py +++ b/machine_learning_hep/multitrial/multitrial.py @@ -68,7 +68,7 @@ def get_yields(cfg): def prepare_figure(cfg, y_label, ticks): fig = plt.figure(figsize=(20, 15)) ax = plt.subplot(1, 1, 1) - ax.set_xlabel(cfg["x_axis"], fontsize=20) + ax.set_xlabel("Trial #", fontsize=20) ax.set_ylabel(y_label, fontsize=20) ax.tick_params(which="both", width=2.5, direction="in") ax.tick_params(which="major", labelsize=20, length=15) @@ -98,7 +98,7 @@ def plot_trial_line(ax, central_trial_ind): def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield): - fig, ax = prepare_figure(cfg, cfg["y_axis"], 100) + fig, ax = prepare_figure(cfg, "Raw yield", 100) x_axis = range(len(trials)) ax.errorbar(x_axis, yields, yerr=yields_err, fmt="o", c="b", elinewidth=2.5, linewidth=4.0) diff --git a/machine_learning_hep/run-mlhep-fitter-multitrial.py b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py similarity index 96% rename from machine_learning_hep/run-mlhep-fitter-multitrial.py rename to machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py index c5a06a1836..d8307def4e 100644 --- a/machine_learning_hep/run-mlhep-fitter-multitrial.py +++ b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.py @@ -2,7 +2,7 @@ """ file: run-mlhep-fitter-multitrial.py brief: Prepare MLHEP database files for different fit configurations for multitrial systematics. -usage: python3 run-mlhep-fitter-multitrial.py +usage: python run-mlhep-fitter-multitrial.py database_lc data/data_run3 trial_configs_dir mlhep_results_dir_pattern author: Maja Karwowska , Warsaw University of Technology """ @@ -36,6 +36,8 @@ ["narrow", "narrow2", "wide", "wide2"] ) +DIR_PATH = "/data8/majak/MLHEP" + def generate_trials(trial_classes): combinations = [""] for trial_class in trial_classes: @@ -134,7 +136,7 @@ def main(db, db_dir, out_db_dir, resdir_pattern): data_cfg = [fit_params for fit_params in fit_cfg if not "level" in fit_params] resdir = f"{resdir_pattern}{comb}" - respath = f"/data8/majak/MLHEP/{resdir}/" + respath = f"{DIR_PATH}/{resdir}/" ana_cfg["data"]["prefix_dir_res"] = respath ana_cfg["mc"]["prefix_dir_res"] = respath diff --git a/machine_learning_hep/run-mlhep-fitter-multitrial.sh b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh similarity index 83% rename from machine_learning_hep/run-mlhep-fitter-multitrial.sh rename to machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh index 26f25972af..de173ee0f5 100644 --- a/machine_learning_hep/run-mlhep-fitter-multitrial.sh +++ b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh @@ -8,7 +8,7 @@ ext=".yml" DIR_PATH="/data8/majak/MLHEP" DIR_PATTERN="results-24022025-newtrain-multitrial-prompt" # Prefix of output directory for fit results -# Paths to masshistos to fit +# Paths to input masshistos to fit BASE_DIR="/data8/majak/MLHEP/results-24022025-newtrain-ptshape-prompt" DATA_HIST="LHC23pp/Results/resultsdatatot/masshisto.root" MC_HIST="LHC24pp_mc/Results/resultsmctot/masshisto.root" @@ -37,11 +37,5 @@ for db in ${OUT_DB_DIR}/*.yml ; do -a Run3analysis \ --run-config submission/analyzer.yml \ --database-analysis ${db} - - # Copy the plots from MachineLearningHEP/machine_learning_hep/fig/ to RESPATH - # It's not compulsory, it's just for you to see the fits - # It might be obsolete if you changed the default output fig/ location in MLHEP - rm -rf ${RESPATH}/fig/ - mv fig/ ${RESPATH}/fig/ done From 7c9873afb9d5583883f2c4870828b01c750dd694 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 15 Jul 2025 13:37:54 +0200 Subject: [PATCH 3/5] Linter fixes --- machine_learning_hep/multitrial/multitrial.py | 17 ++++++++--------- .../multitrial/run-mlhep-fitter-multitrial.sh | 14 +++++++------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/machine_learning_hep/multitrial/multitrial.py b/machine_learning_hep/multitrial/multitrial.py index 71194ce979..7d3c335a7a 100644 --- a/machine_learning_hep/multitrial/multitrial.py +++ b/machine_learning_hep/multitrial/multitrial.py @@ -51,7 +51,7 @@ def get_yields(cfg): trial_name = dirname.replace(cfg["dir_pattern"], "") for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"], cfg["pt_bins_max"])): - if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ + if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ # pylint: disable=eval-used and hist.GetBinContent(ind + 1) > 1.0 : yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1)) yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1)) @@ -65,7 +65,7 @@ def get_yields(cfg): return yields, yields_err, trials, chis -def prepare_figure(cfg, y_label, ticks): +def prepare_figure(y_label, ticks): fig = plt.figure(figsize=(20, 15)) ax = plt.subplot(1, 1, 1) ax.set_xlabel("Trial #", fontsize=20) @@ -79,10 +79,9 @@ def prepare_figure(cfg, y_label, ticks): return fig, ax -def set_ax_limits(ax, pt_string, values, errs): +def set_ax_limits(ax, pt_string, values): ax.margins(0.01, 0.2) np_values = np.array(values, dtype="float32") - np_errs = np.array(errs, dtype="float32") if ax.get_ylim()[1] - ax.get_ylim()[0] > 30.0 * np.std(np_values): ax.set_ylim(np.mean(np_values) - 10.0 * np.std(np_values), np.mean(np_values) + 10.0 * np.std(np_values)) @@ -98,11 +97,11 @@ def plot_trial_line(ax, central_trial_ind): def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield): - fig, ax = prepare_figure(cfg, "Raw yield", 100) + fig, ax = prepare_figure("Raw yield", 100) x_axis = range(len(trials)) ax.errorbar(x_axis, yields, yerr=yields_err, fmt="o", c="b", elinewidth=2.5, linewidth=4.0) - set_ax_limits(ax, pt_string, yields, yields_err) + set_ax_limits(ax, pt_string, yields) central_line = np.array([central_yield] * len(x_axis), dtype="float32") ax.plot(x_axis, central_line, c="orange", ls="--", linewidth=4.0) central_err = np.array([yields_err[central_trial_ind]] * len(x_axis), dtype="float32") @@ -116,10 +115,10 @@ def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_strin def plot_chis(chis, cfg, pt_string, plot_pt_string): - fig, ax = prepare_figure(cfg, "Chi2/ndf", 100) + fig, ax = prepare_figure("Chi2/ndf", 100) x_axis = range(len(chis)) ax.scatter(x_axis, chis, c="b", marker="o") - set_ax_limits(ax, pt_string, chis, [0.0] * len(chis)) + set_ax_limits(ax, pt_string, chis) plot_text_box(ax, plot_pt_string) fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_chis_{pt_string}.png', bbox_inches='tight') @@ -175,7 +174,7 @@ def main(): plot_yields_distr(yields[pt_string], cfg, pt_string, plot_pt_string, central_trial_ind, central_yield) plot_chis(chis[pt_string], cfg, pt_string, plot_pt_string) - except: + except: # pylint: disable=bare-except pass with open(f'{cfg["outdir"]}/{cfg["outfile"]}_trials_{pt_string}.txt', diff --git a/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh index de173ee0f5..4006d0790a 100644 --- a/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh +++ b/machine_learning_hep/multitrial/run-mlhep-fitter-multitrial.sh @@ -18,11 +18,11 @@ MC_HIST="LHC24pp_mc/Results/resultsmctot/masshisto.root" # The output analysis dir is set in databases to DIR_PATTERN + suffix with trial name python run-mlhep-fitter-multitrial.py "${DB_PATTERN}" "${DB_DIR}" "${OUT_DB_DIR}" "${DIR_PATTERN}" || exit 1 -for db in ${OUT_DB_DIR}/*.yml ; do - db_basename=`basename ${db}` - db_basename_no_ext=${db_basename%%${ext}} - echo ${db_basename_no_ext} - suffix=${db_basename_no_ext##${DB_PATTERN}} +for db in "${OUT_DB_DIR}"/*.yml ; do + db_basename=$(basename "${db}") + db_basename_no_ext=${db_basename%%"${ext}"} + echo "${db_basename_no_ext}" + suffix=${db_basename_no_ext##"${DB_PATTERN}"} echo "suffix: ${suffix}" RESPATH="${DIR_PATH}/${DIR_PATTERN}${suffix}" echo "respath: ${RESPATH}" @@ -33,9 +33,9 @@ for db in ${OUT_DB_DIR}/*.yml ; do cp "${BASE_DIR}/${DATA_HIST}" "${RESPATH}/${DATA_HIST}" cp "${BASE_DIR}/${MC_HIST}" "${RESPATH}/${MC_HIST}" - mlhep logfile_${db_basename}.log \ + mlhep "logfile_${db_basename}.log" \ -a Run3analysis \ --run-config submission/analyzer.yml \ - --database-analysis ${db} + --database-analysis "${db}" done From 0b46c9acb4c4dc1d51fa3996ddb2b0cba1770b8d Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Tue, 15 Jul 2025 13:44:59 +0200 Subject: [PATCH 4/5] Nitpick --- machine_learning_hep/multitrial/multitrial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/machine_learning_hep/multitrial/multitrial.py b/machine_learning_hep/multitrial/multitrial.py index 7d3c335a7a..593a99dd31 100644 --- a/machine_learning_hep/multitrial/multitrial.py +++ b/machine_learning_hep/multitrial/multitrial.py @@ -51,8 +51,8 @@ def get_yields(cfg): trial_name = dirname.replace(cfg["dir_pattern"], "") for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"], cfg["pt_bins_max"])): - if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ # pylint: disable=eval-used - and hist.GetBinContent(ind + 1) > 1.0 : + if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \ + and hist.GetBinContent(ind + 1) > 1.0 : # pylint: disable=eval-used yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1)) yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1)) trials[f"{pt_bin_min}_{pt_bin_max}"].append(trial_name) From ecef407788416a7de370ff2f2551235a6e5f3a41 Mon Sep 17 00:00:00 2001 From: saganatt <8majak8@gmail.com> Date: Thu, 17 Jul 2025 10:20:19 +0200 Subject: [PATCH 5/5] Add leftover JSON --- machine_learning_hep/multitrial/README.md | 2 ++ .../multitrial/config_multitrial.json | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 machine_learning_hep/multitrial/config_multitrial.json diff --git a/machine_learning_hep/multitrial/README.md b/machine_learning_hep/multitrial/README.md index 93054c751c..8437924336 100644 --- a/machine_learning_hep/multitrial/README.md +++ b/machine_learning_hep/multitrial/README.md @@ -34,3 +34,5 @@ Before running, you need to create the directory structure for each MLHEP output Files: `multitrial.py`, `config_multitrial.json`
Usage: `python3 multitrial.py config_multitrial.json` + +Adjust the sample `config_multitrial.json` to your needs. diff --git a/machine_learning_hep/multitrial/config_multitrial.json b/machine_learning_hep/multitrial/config_multitrial.json new file mode 100644 index 0000000000..35aa25a32e --- /dev/null +++ b/machine_learning_hep/multitrial/config_multitrial.json @@ -0,0 +1,20 @@ +{ + "file_pattern": "/data8/majak/MLHEP/results-24022025-newtrain-multitrial-prompt*/LHC23pp_pass4/Results/resultsdatatot/yields_LcpKpi_Run3analysis.root", + "_file_pattern": "regex pattern for all multitrial fit files; note the asterisk to match all trial suffixes", + "dir_pattern": "results-24022025-newtrain-multitrial-prompt", + "_dir_pattern": "the base directory prefix from the file pattern above", + "histoname": "hyields0", + "_histoname": "histogram with mass fit", + "sel_histoname": "hchi0", + "_sel_histoname": "histogram for filtering the results", + "selection": "lambda x : x < 5.0", + "_selection": "filter to apply with sel_histoname, e.g., chi < 5", + "pt_bins_min": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16], + "pt_bins_max": [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24], + "central_trial": "", + "_central_trial": "suffix of the directory with the central trial", + "outdir": "/data8/majak/multitrial", + "_outdir": "output directory", + "outfile": "result-prompt-chi5", + "_outfile": "output file pattern" +}