From 2d9f994c32182af02587e0a6f5c10ff61047a368 Mon Sep 17 00:00:00 2001
From: Jochen Klein
Date: Thu, 21 Aug 2025 11:04:25 +0200
Subject: [PATCH 01/20] Update DB for D0 jets

---
 .../database_ml_parameters_D0Jet_pp.yml       | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
index 61bd6b388d..ce7a030b01 100644
--- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
+++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
@@ -80,10 +80,10 @@ D0Jet_pp:
       extra:
         fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated
       tags:
-        ismcsignal: {var: fFlagMcMatchGen, req: [[0], []], abs: true}
-        ismcbkg: {var: fFlagMcMatchGen, req: [[], [0]], abs: true}
-        ismcprompt: {var: fOriginMcGen, req: [[0], [1]]}
-        ismcfd: {var: fOriginMcGen, req: [[1], [0]]}
+        ismcsignal: {var: fFlagMcMatchGen, req: 1, abs: true, level: mc}
+        ismcbkg: {var: ismcsignal, req: 0, level: mc}
+        ismcprompt: {var: fOriginMcGen, req: 1, level: mc}
+        ismcfd: {var: fOriginMcGen, req: 2, level: mc}
       filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut

     colldet:
@@ -110,12 +110,12 @@ D0Jet_pp:
         isd0: fFlagMcMatchRec == 1
         isd0bar: fFlagMcMatchRec == -1
       tags:
-        ismcsignal: {var: fFlagMcMatchRec, req: [[0], []], abs: True}
-        ismcbkg: {var: fFlagMcMatchRec, req: [[], [0]], abs: True}
-        seld0: {var: fCandidateSelFlag, req: [[0], []]}
-        seld0bar: {var: fCandidateSelFlag, req: [[1], []]}
-        ismcprompt: {var: fOriginMcRec, req: [[0], [1]]}
-        ismcfd: {var: fOriginMcRec, req: [[1], [0]]}
+        ismcsignal: {var: fFlagMcMatchRec, req: 1, abs: true, level: mc}
+        ismcbkg: {var: ismcsignal, req: 0, level: mc}
+        ismcprompt: {var: fOriginMcRec, req: 1, level: mc}
+        ismcfd: {var: fOriginMcRec, req: 2, level: mc}
+        seld0: {var: fCandidateSelFlag, req: 0, level: mc}
+        seld0bar: {var: fCandidateSelFlag, req: 1, level: mc}
       extract_component:
         - {var: fMlScores, newvar: mlBkgScore, component: 0}
       filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut

From 06744ed6e639dc32ab0be9b56a1b18beeb6aa6bf Mon Sep 17 00:00:00 2001
From: Jochen Klein
Date: Thu, 6 Mar 2025 08:26:32 +0100
Subject: [PATCH 02/20] Restrict statistics for inclusive jets

---
 .../data/data_run3/database_ml_parameters_Jet_pp.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml
index 358973d3bf..bcf26249c0 100644
--- a/machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml
+++ b/machine_learning_hep/data/data_run3/database_ml_parameters_Jet_pp.yml
@@ -217,7 +217,7 @@ Jet_pp:
   multi:
     data:
       nprocessesparallel: 10
-      maxfiles: [-1] #list of periods
+      maxfiles: [60] #list of periods
       chunksizeunp: [100] #list of periods
       chunksizeskim: [100] #list of periods
      fracmerge: [.1] #list of periods
@@ -234,7 +234,7 @@ Jet_pp:
       mcreweights: [../Analyses] #list of periods
     mc:
       nprocessesparallel: 80
-      maxfiles: [-1] #list of periods
+      maxfiles: [80] #list of periods
       chunksizeunp: [100] #list of periods
       chunksizeskim: [1000] #list of periods
       fracmerge: [1.] #list of periods

From f3549a1afdd0b5a0c97e6c4c67b87f55682b2e2b Mon Sep 17 00:00:00 2001
From: Jochen Klein
Date: Thu, 6 Mar 2025 11:27:55 +0100
Subject: [PATCH 03/20] Prepare for gen-only productions

---
 .../database_ml_parameters_D0Jet_pp.yml       | 105 +++++-----
 machine_learning_hep/multiprocesser.py        |  64 +++---
 machine_learning_hep/processer.py             | 101 ++++-----
 machine_learning_hep/processer_jet.py         |   6 +-
 machine_learning_hep/processerdhadrons.py     |   4 +-
 .../processerdhadrons_mult.py                 |   4 +-
 machine_learning_hep/steer_analysis.py        | 193 +++++++-----------
 7 files changed, 226 insertions(+), 251 deletions(-)

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
index ce7a030b01..1fbd8ba83e 100644
--- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
+++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
@@ -51,17 +51,17 @@ D0Jet_pp:
   #region dfs
   dfs:
     read:
-      evtorig:
-        index: fIndexHfD0McCollBases
-        trees:
-          O2hfd0collbase: [fNumContrib, fPosZ]
-        filter: "abs(fPosZ) < 10."
-      collcnt:
-        trees:
-          O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC]
-      bccnt:
-        trees:
-          O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB]
+      # evtorig:
+      #   index: fIndexHfD0McCollBases
+      #   trees:
+      #     O2hfd0collbase: [fNumContrib, fPosZ]
+      #   filter: "abs(fPosZ) < 10."
+      # collcnt:
+      #   trees:
+      #     O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC]
+      # bccnt:
+      #   trees:
+      #     O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB]
       collgen:
         # TODO: check if we can use the HF collision table instead
         level: gen
@@ -73,10 +73,10 @@ D0Jet_pp:
         level: gen
         index: fIndexD0CMCPJETOS
        trees:
          O2hfd0pbase: [fIndexHFD0MCCOLLBASES, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen]
          O2d0cmcpjeto: [fIndexD0CMCPJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR]
          # O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt]
          O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity, fPairJetPt, fPairJetEnergy, fPairJetTheta]
        extra:
          fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated
        tags:
          ismcsignal: {var: fFlagMcMatchGen, req: 1, abs: true, level: mc}
          ismcbkg: {var: ismcsignal, req: 0, level: mc}
          ismcprompt: {var: fOriginMcGen, req: 1, level: mc}
          ismcfd: {var: fOriginMcGen, req: 2, level: mc}
        filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut
@@ -145,10 +145,10 @@ D0Jet_pp:
          - {var: fMlScores, newvar: mlBkgScore, component: 0}
        filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut
-    merge:
-      - {base: jetgen, ref: evtorig}
-      - {base: jetdet, ref: colldet}
-      - {base: jetdata, ref: colldata}
+    # merge:
+    #   - {base: jetgen, ref: evtorig}
+    #   - {base: jetdet, ref: colldet}
+    #   - {base: jetdata, ref: colldata}

     # workaround for yamlfmt issue #110
     write:
       jetdata:
         level: data
         file: AnalysisResultsReco.parquet
-      evtorig:
-        level: all
-        file: AnalysisResultsEvtOrig.parquet
-      evt:
-        level: all
-        source: evtorig
-        file: AnalysisResultsEvt.parquet
-      collcnt:
-        level: all
-        file: AnalysisResultsCollCnt.parquet
-      bccnt:
-        level: all
-        file: AnalysisResultsBcCnt.parquet
+      # evtorig:
+      #   level: all
+      #   file: AnalysisResultsEvtOrig.parquet
+      # evt:
+      #   level: all
+      #   source: evtorig
+      #   file: AnalysisResultsEvt.parquet
+      # collcnt:
+      #   level: all
+      #   file: AnalysisResultsCollCnt.parquet
+      # bccnt:
+      #   level: all
+      #   file: AnalysisResultsBcCnt.parquet

   variables:
     var_all: [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPiExpPi, fNSigTpcKaExpPi, fNSigTpcPiExpKa, fNSigTpcKaExpKa]

   multi:
     data:
       nprocessesparallel: 80
-      maxfiles: [-1] #list of periods
+      maxfiles: [1] #list of periods
       chunksizeunp: [100] #list of periods
       chunksizeskim: [100] #list of periods
       fracmerge: [.1] #list of periods
       seedmerge: [12] #list of periods
       period: [LHC22o] #list of periods
       select_period: [1]
-      prefix_dir: /data2/MLhep/real/train_318624/ # full stats new
-      # prefix_dir: /data2/MLhep/real/train_257023/ # full stats
-      # prefix_dir: /data2/MLhep/real/train_240309/ # Andrea's model
-      # prefix_dir: /data2/MLhep/real/train_245508/ # pass7
-      # prefix_dir: /data2/MLhep/real/train_245509/ # no TOF
-      # prefix_dir: /data2/MLhep/real/train_245510/ # no PID
-      # prefix_dir: /data2/MLhep/real/train_245511/ # full PID
+      prefix_dir: /data2/MLhep/real/train_318624/
       unmerged_tree_dir: [/alice/] #list of periods
       pkl: ["${USER}/d0jet/pkl"] #list of periods
       pkl_skimmed: ["${USER}/d0jet/pklsk"] #list of periods
       pkl_skimmed_merge_for_ml: ["${USER}/d0jet/pklskml"] #list of periods
       pkl_skimmed_merge_for_ml_all: "${USER}/d0jet/pp_data_mltot"
       pkl_evtcounter_all: "${USER}/d0jet/pp_data_evttot"
       mcreweights: [../Analyses] #list of periods
     mc:
       nprocessesparallel: 80
-      maxfiles: [-1] #list of periods
+      maxfiles: [1] #list of periods
       chunksizeunp: [100] #list of periods
       chunksizeskim: [1000] #list of periods
       fracmerge: [1.] #list of periods
       seedmerge: [12] #list of periods
       period: [LHC24d3b] #list of periods
       select_period: [1]
-      prefix_dir: /data2/MLhep/sim/train_318983/ # full stats new
-      # prefix_dir: /data2/MLhep/sim/train_257228/ # full stats
-      # prefix_dir: /data2/MLhep/sim/train_259454/ # 97 % tracking eff
-      # prefix_dir: /data2/MLhep/sim/train_239774/ # Andrea's model
-      # prefix_dir: /data2/MLhep/sim/train_244893/ # no TOF
-      # prefix_dir: /data2/MLhep/sim/train_244894/ # no PID
-      # prefix_dir: /data2/MLhep/sim/train_244895/ # full PID
+      prefix_dir: /data2/MLhep/sim/train_318983/
       unmerged_tree_dir: [alice/]
       pkl: ["${USER}/d0jet/pkl"] #list of periods
       pkl_skimmed: ["${USER}/d0jet/pklsk"] #list of periods
       pkl_skimmed_merge_for_ml: ["${USER}/d0jet/pklskml"] #list of periods
       pkl_skimmed_merge_for_ml_all: "${USER}/d0jet/pp_mc_prod_mltot"
       pkl_evtcounter_all: "${USER}/d0jet/pp_mc_prod_evttot"
       mcreweights: [../Analyses] #list of periods
+    fd:
+      nprocessesparallel: 80
+      maxfiles: [1] #list of periods
+      chunksizeunp: [100] #list of periods
+      chunksizeskim: [1000] #list of periods
+      fracmerge: [1.] #list of periods
+      seedmerge: [12] #list of periods
+      period: [LHC24d3b] #list of periods
+      select_period: [1]
+      prefix_dir: /home/jklein/mlhep_test/powheg/
+      unmerged_tree_dir: [d0-default/]
+      pkl: ["${USER}/d0jet/pkl"] #list of periods
+      pkl_skimmed: ["${USER}/d0jet/pklsk"] #list of periods
+      pkl_skimmed_merge_for_ml: ["${USER}/d0jet/pklskml"] #list of periods
+      pkl_skimmed_merge_for_ml_all: "${USER}/d0jet/pp_mc_prod_mltot"
+      pkl_evtcounter_all: "${USER}/d0jet/pp_mc_prod_evttot"
+      mcreweights: [../Analyses] #list of periods

   ml:
     evtsel: null # TODO: fIsEventReject == 0
@@ -769,10 +774,16 @@ D0Jet_pp:
         runselection: [null] #FIXME # used but useless
         results: ["/home/${USER}/mlhep/d0jet/jet_obs/default/default/mc/results"] #list of periods
         resultsallp: "/home/${USER}/mlhep/d0jet/jet_obs/default/default/mc/results_all"
+      fd: &fd_out_default
+        runselection: [null] #FIXME # used but useless
+        results: ["/home/${USER}/mlhep/d0jet/jet_obs/default/default/fd/results"] #list of periods
+        resultsallp: "/home/${USER}/mlhep/d0jet/jet_obs/default/default/fd/results_all"
       data_proc: # alternative processor output used as the analyzer input
         <<: *data_out_default
       mc_proc: # alternative processor output used as the analyzer input
         <<: *mc_out_default
+      fd_proc: # alternative processor output used as the analyzer input
+        <<: *fd_out_default
       # simple fitter START # used in cplusutilities/mass_fitter.C
       # sgnfunc: [0,0,0,0,0,0,0,0,0,0,0,0] # kGaus=0, k2Gaus=1, k2GausSigmaRatioPar=2 (sel_an_binmin bins)

diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py
index 9fd4048378..ccb4fa3c06 100755
--- a/machine_learning_hep/multiprocesser.py
+++ b/machine_learning_hep/multiprocesser.py
@@ -16,33 +16,44 @@
 main script for doing data processing, machine learning and analysis
 """

+from functools import reduce
 import os
 import tempfile
+from typing import TypeVar

 from machine_learning_hep.io_ml_utils import dump_yaml_from_dict, parse_yaml
 from machine_learning_hep.logger import get_logger
 from machine_learning_hep.utilities import merge_method, mergerootfiles
-
+from .common import DataType

 class MultiProcesser:  # pylint: disable=too-many-instance-attributes, too-many-statements, consider-using-f-string, too-many-branches
     species = "multiprocesser"
     logger = get_logger()

-    def __init__(self, case, proc_class, datap, typean, run_param, mcordata):
+    T = TypeVar("T")
+
+    def cfg(self, param: str, default: T = None) -> T:
+        return reduce(
+            lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
+            param.split("."),
+            self.datap,
+        )
+
+    def __init__(self, case, proc_class, datap, typean, run_param, datatype):
         self.case = case
         self.datap = datap
         self.typean = typean
         self.run_param = run_param
-        self.mcordata = mcordata
-        self.prodnumber = len(datap["multi"][self.mcordata]["unmerged_tree_dir"])
-        self.p_period = datap["multi"][self.mcordata]["period"]
-        self.select_period = datap["multi"][self.mcordata]["select_period"]
-        self.p_seedmerge = datap["multi"][self.mcordata]["seedmerge"]
-        self.p_fracmerge = datap["multi"][self.mcordata]["fracmerge"]
-        self.p_maxfiles = datap["multi"][self.mcordata]["maxfiles"]
-        self.p_chunksizeunp = datap["multi"][self.mcordata]["chunksizeunp"]
-        self.p_chunksizeskim = datap["multi"][self.mcordata]["chunksizeskim"]
-        self.p_nparall = datap["multi"][self.mcordata]["nprocessesparallel"]
+        self.datatype = DataType(datatype)
+        self.prodnumber = len(datap["multi"][self.datatype.value]["unmerged_tree_dir"])
+        self.p_period = datap["multi"][self.datatype.value]["period"]
datap["multi"][self.datatype.value]["period"] + self.select_period = datap["multi"][self.datatype.value]["select_period"] + self.p_seedmerge = datap["multi"][self.datatype.value]["seedmerge"] + self.p_fracmerge = datap["multi"][self.datatype.value]["fracmerge"] + self.p_maxfiles = datap["multi"][self.datatype.value]["maxfiles"] + self.p_chunksizeunp = datap["multi"][self.datatype.value]["chunksizeunp"] + self.p_chunksizeskim = datap["multi"][self.datatype.value]["chunksizeskim"] + self.p_nparall = datap["multi"][self.datatype.value]["nprocessesparallel"] self.lpt_anbinmin = datap["sel_skim_binmin"] self.lpt_anbinmax = datap["sel_skim_binmax"] self.p_nptbins = len(datap["sel_skim_binmax"]) @@ -53,18 +64,19 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.dlper_pkl = [] self.dlper_pklsk = [] self.dlper_pklml = [] - self.d_prefix = datap["multi"][self.mcordata].get("prefix_dir", "") - self.d_prefix_app = datap["mlapplication"][self.mcordata].get("prefix_dir_app", "") - self.d_prefix_res = datap["analysis"][self.typean][self.mcordata].get("prefix_dir_res", "") + self.d_prefix = datap["multi"][self.datatype.value].get("prefix_dir", "") + self.d_prefix_app = self.cfg(f"mlapplication.{self.datatype.value}.prefix_dir_app", "") + self.d_prefix_res = self.cfg(f"analysis.{self.typean}.{self.datatype.value}.prefix_dir_res", "") - dp = datap["multi"][self.mcordata] + dp = datap["multi"][self.datatype.value] self.dlper_root = [self.d_prefix + os.path.expandvars(p) for p in dp["unmerged_tree_dir"]] + print('****', self.dlper_root, flush=True) self.dlper_pkl = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl"]] self.dlper_pklsk = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl_skimmed"]] self.dlper_pklml = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl_skimmed_merge_for_ml"]] self.d_pklml_mergedallp = self.d_prefix + os.path.expandvars(dp["pkl_skimmed_merge_for_ml_all"]) self.d_pklevt_mergedallp = self.d_prefix + os.path.expandvars(dp["pkl_evtcounter_all"]) - self.dlper_mcreweights = datap["multi"][self.mcordata]["mcreweights"] + self.dlper_mcreweights = datap["multi"][self.datatype.value]["mcreweights"] # namefiles pkl self.v_var_binning = datap["var_binning"] @@ -99,21 +111,21 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.lper_evt = [os.path.join(direc, self.n_evt) for direc in self.dlper_pkl] self.lper_evtorig = [os.path.join(direc, self.n_evtorig) for direc in self.dlper_pkl] - dp = datap["mlapplication"][self.mcordata] - self.dlper_reco_modapp = [self.d_prefix_app + p for p in dp["pkl_skimmed_dec"]] - self.dlper_reco_modappmerged = [self.d_prefix_app + p for p in dp["pkl_skimmed_decmerged"]] + dp = self.cfg(f"mlapplication.{self.datatype.value}", {}) + self.dlper_reco_modapp = [self.d_prefix_app + p for p in dp["pkl_skimmed_dec"]] if dp else [None] * len(self.p_period) + self.dlper_reco_modappmerged = [self.d_prefix_app + p for p in dp["pkl_skimmed_decmerged"]] if dp else [None] * len(self.p_period) - dp = datap["analysis"][self.typean][self.mcordata] + dp = self.cfg(f"analysis.{self.typean}.{self.datatype.value}", {}) self.d_results = [self.d_prefix_res + os.path.expandvars(p) for p in dp["results"]] self.d_resultsallp = self.d_prefix_res + os.path.expandvars(dp["resultsallp"]) self.f_evt_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evt) self.f_evtorig_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evtorig) - self.lper_runlistrigger = 
datap["analysis"][self.typean][self.mcordata]["runselection"] + self.lper_runlistrigger = self.cfg(f"analysis.{self.typean}.{self.datatype.value}.runselection", [None] * len(self.p_period)) self.lper_mcreweights = None - if self.mcordata == "mc": + if self.datatype == DataType.MC: self.lper_mcreweights = [os.path.join(direc, self.n_mcreweights) for direc in self.dlper_mcreweights] self.process_listsample = [] @@ -123,7 +135,7 @@ def __init__(self, case, proc_class, datap, typean, run_param, mcordata): self.case, self.datap, self.run_param, - self.mcordata, + self.datatype.value, self.p_maxfiles[indexp], self.dlper_root[indexp], self.dlper_pkl[indexp], @@ -186,10 +198,10 @@ def multi_mergeml_allinone(self): for indexp in range(self.prodnumber): if self.select_period[indexp] == 0: self.lptper_recoml[ipt].remove(self.lptper_recoml[ipt][indexp]) - if self.mcordata == "mc": + if self.datatype == DataType.MC: self.lptper_genml[ipt].remove(self.lptper_genml[ipt][indexp]) merge_method(self.lptper_recoml[ipt], self.lpt_recoml_mergedallp[ipt]) - if self.mcordata == "mc": + if self.datatype == DataType.MC: merge_method(self.lptper_genml[ipt], self.lpt_genml_mergedallp[ipt]) count_evt = 0 diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index b2766d3cbd..192d3b0ae8 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -88,13 +88,14 @@ def __init__( # directories self.d_prefix_ml = datap["ml"].get("prefix_dir_ml", "") self.d_root = d_root + print('%%%%', self.d_root, flush=True) self.d_pkl = d_pkl self.d_pklsk = d_pklsk self.d_pkl_ml = d_pkl_ml self.d_results = d_results self.d_mcreweights = d_mcreweights # used in hadrons self.datap = datap - self.mcordata = mcordata + self.datatype = mcordata self.lpt_anbinmin = datap["sel_skim_binmin"] self.lpt_anbinmax = datap["sel_skim_binmax"] @@ -177,6 +178,7 @@ def __init__( # list of files names if os.path.isdir(self.d_root): + print(f"{self.d_root=}", flush=True) self.l_path = list_folders(self.d_root, self.n_root, self.p_maxfiles, self.select_jobs) elif glob.glob(f"{self.d_pkl}/**/{self.n_reco}", recursive=True): self.l_path = list_folders(self.d_pkl, self.n_reco, self.p_maxfiles, self.select_jobs) @@ -185,6 +187,7 @@ def __init__( ".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[0], self.lpt_anbinmax[0]) ) self.l_path = list_folders(self.d_pklsk, self.n_sk, self.p_maxfiles, self.select_jobs) + print(f"{self.l_path=}", flush=True) self.l_root = createlist(self.d_root, self.l_path, self.n_root) self.l_reco = createlist(self.d_pkl, self.l_path, self.n_reco) @@ -196,7 +199,7 @@ def __init__( self.l_histoeff = createlist(self.d_results, self.l_path, self.n_fileeff) # self.l_historesp = createlist(self.d_results, self.l_path, self.n_fileresp) - if self.mcordata == "mc": + if self.datatype == "mc": self.l_gen = createlist(self.d_pkl, self.l_path, self.n_gen) if self.n_gen_sl: self.l_gen_sl = createlist(self.d_pkl, self.l_path, self.n_gen_sl) @@ -223,41 +226,36 @@ def __init__( for bin in self.bins_analysis ] - self.lpt_probcutpre = datap["mlapplication"]["probcutpresel"][self.mcordata] - lpt_probcutfin_tmp = datap["mlapplication"]["probcutoptimal"] + self.lpt_probcutpre = self.cfg_global(f"mlapplication.probcutpresel.{self.datatype}", [None] * self.p_nptbins) + lpt_probcutfin_tmp = self.cfg_global(f"mlapplication.probcutoptimal", [None] * self.p_nptfinbins) self.lpt_probcutfin = [lpt_probcutfin_tmp[bin_matching[ibin]] for ibin in range(self.p_nptfinbins)] - for ibin, probcutfin in 
enumerate(self.lpt_probcutfin): - probcutpre = self.lpt_probcutpre[bin_matching[ibin]] - if self.mltype == "MultiClassification": - if probcutfin[0] > probcutpre[0] or probcutfin[1] < probcutpre[1] or probcutfin[2] < probcutpre[2]: + if self.datatype in ('mc', 'data'): + for ibin, probcutfin in enumerate(self.lpt_probcutfin): + probcutpre = self.lpt_probcutpre[bin_matching[ibin]] + if self.mltype == "MultiClassification": + if probcutfin[0] > probcutpre[0] or probcutfin[1] < probcutpre[1] or probcutfin[2] < probcutpre[2]: + self.logger.fatal( + "Probability cut final: %s must be tighter than presel %s!\n" + "Verify that bkg prob presel > final, and other cuts presel < final", + self.lpt_probcutfin, + self.lpt_probcutpre, + ) + elif probcutfin < probcutpre: self.logger.fatal( - "Probability cut final: %s must be tighter than presel %s!\n" - "Verify that bkg prob presel > final, and other cuts presel < final", + "Probability cut final: %s must be tighter (smaller values) than presel %s!", self.lpt_probcutfin, self.lpt_probcutpre, ) - elif probcutfin < probcutpre: - self.logger.fatal( - "Probability cut final: %s must be tighter (smaller values) than presel %s!", - self.lpt_probcutfin, - self.lpt_probcutpre, - ) - if self.mltype == "MultiClassification": - self.l_selml = [] - comps = ["<=", ">=", ">="] - for ipt in range(self.p_nptfinbins): - mlsel_multi = [ - f"y_test_prob{self.p_modelname}{label.replace('-', '_')} {comp} {probcut}" - for label, comp, probcut in zip(self.class_labels, comps, self.lpt_probcutfin[ipt], strict=False) + if self.mltype == "MultiClassification": + self.l_selml = [] + comps = ["<=", ">=", ">="] + for ipt in range(self.p_nptfinbins): + mlsel_multi = [ + f"y_test_prob{self.p_modelname}{label.replace('-', '_')} {comp} {probcut}" + for label, comp, probcut in zip(self.class_labels, comps, self.lpt_probcutfin[ipt], strict=False) ] - self.l_selml.append(" and ".join(mlsel_multi)) - - else: - self.l_selml = [ - f"y_test_prob{self.p_modelname} > {self.lpt_probcutfin[ipt]}" for ipt in range(self.p_nptfinbins) - ] self.d_pkl_dec = d_pkl_dec self.mptfiles_recosk = [] @@ -265,6 +263,7 @@ def __init__( self.mptfiles_gensk_sl = [] self.d_pkl_decmerged = d_pkl_decmerged + print(self.d_results, self.n_filemass, flush=True) self.n_filemass = os.path.join(self.d_results, self.n_filemass) self.n_fileeff = os.path.join(self.d_results, self.n_fileeff) self.n_fileresp = os.path.join(self.d_results, self.n_fileresp) @@ -293,7 +292,7 @@ def __init__( ) self.lpt_recodec = None - if self.doml is True: + if self.doml and self.datatype in ('mc', 'data'): if self.mltype == "MultiClassification": self.lpt_recodec = [ self.n_reco.replace( @@ -327,11 +326,11 @@ def __init__( ] self.mptfiles_recoskmldec = [ createlist(self.d_pkl_dec, self.l_path, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) - ] + ] if self.datatype in ('mc', 'data') else [] self.lpt_recodecmerged = [ os.path.join(self.d_pkl_decmerged, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) - ] - if self.mcordata == "mc": + ] if self.datatype in ('mc', 'data') else [] + if self.datatype == "mc": self.mptfiles_gensk = [ createlist(self.d_pklsk, self.l_path, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) ] @@ -365,6 +364,13 @@ def cfg(self, param: str, default: T = None) -> T: self.datap["analysis"][self.typean], ) + def cfg_global(self, param: str, default: T = None) -> T: + return reduce( + lambda d, key: d.get(key, default) if isinstance(d, dict) else default, + param.split("."), + self.datap, + ) + def 
         def dfread(rdir, trees, cols, idx_name=None):
             """Read DF from multiple (joinable) O2 tables"""
@@ -411,8 +417,9 @@ def dfuse(df_spec):
             level = df_spec.get("level", "all")
             return (
                 (level == "all")
-                or (level in ("mc", "gen", "det") and self.mcordata == "mc")
-                or (level in ("data") and self.mcordata == "data")
+                or (level in ("mc", "gen", "det") and self.datatype == "mc")
+                or (level in ("mc", "gen") and self.datatype == "fd")
+                or (level in ("data") and self.datatype == "data")
             )

         self.logger.info("unpacking: %s", self.l_root[file_index])
@@ -533,8 +540,8 @@ def dfuse(df_spec):

     def skim(self, file_index):
         dfreco = read_df(self.l_reco[file_index])
-        dfgen = read_df(self.l_gen[file_index]) if self.mcordata == "mc" else None
-        dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.mcordata == "mc" else None
+        dfgen = read_df(self.l_gen[file_index]) if self.datatype == "mc" else None
+        dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype == "mc" else None

         for ipt in range(self.p_nptbins):
             dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
@@ -606,14 +613,14 @@ def parallelizer(self, function, argument_list, maxperchunk):
         # pass

     def process_unpack_par(self):
-        self.logger.info("Unpacking %s period %s", self.mcordata, self.period)
+        self.logger.info("Unpacking %s period %s", self.datatype, self.period)
         create_folder_struc(self.d_pkl, self.l_path)
         arguments = [(i,) for i in range(len(self.l_root))]
-        self.logger.debug("d_pkl: %s, l_path: %s, arguments: %s", self.d_pkl, str(self.l_path), str(arguments))
+        self.logger.info("d_pkl: %s, l_path: %s, arguments: %s", self.d_pkl, str(self.l_path), str(arguments))
         self.parallelizer(self.unpack, arguments, self.p_chunksizeunp)

     def process_skim_par(self):
-        self.logger.info("Skimming %s period %s", self.mcordata, self.period)
+        self.logger.info("Skimming %s period %s", self.datatype, self.period)
         create_folder_struc(self.d_pklsk, self.l_path)
         arguments = [(i,) for i in range(len(self.l_reco))]
         self.parallelizer(self.skim, arguments, self.p_chunksizeskim)
@@ -622,13 +629,13 @@ def process_skim_par(self):
             merge_method(self.l_evtorig, self.f_totevtorig)

     def process_applymodel_par(self):
-        self.logger.info("Applying model to %s period %s", self.mcordata, self.period)
+        self.logger.info("Applying model to %s period %s", self.datatype, self.period)
         create_folder_struc(self.d_pkl_dec, self.l_path)
         arguments = [(i,) for i in range(len(self.mptfiles_recosk[0]))]
         self.parallelizer(self.applymodel, arguments, self.p_chunksizeskim)

     def process_mergeforml(self):
-        self.logger.info("doing merging for ml %s %s", self.mcordata, self.period)
+        self.logger.info("doing merging for ml %s %s", self.datatype, self.period)
         indices_for_evt = []
         for ipt in range(self.p_nptbins):
             nfiles = len(self.mptfiles_recosk[ipt])
@@ -642,7 +649,7 @@ def process_mergeforml(self):
             indices_for_evt = list(set(indices_for_evt) | set(filesel))
             list_sel_recosk = [self.mptfiles_recosk[ipt][j] for j in filesel]
             merge_method(list_sel_recosk, self.lpt_reco_ml[ipt])
-            if self.mcordata == "mc":
+            if self.datatype == "mc":
                 list_sel_gensk = [self.mptfiles_gensk[ipt][j] for j in filesel]
                 merge_method(list_sel_gensk, self.lpt_gen_ml[ipt])

@@ -655,7 +662,7 @@ def process_mergeforml(self):
     def process_mergedec(self):
         for ipt in range(self.p_nptbins):
             merge_method(self.mptfiles_recoskmldec[ipt], self.lpt_recodecmerged[ipt])
-            if self.mcordata == "mc":
+            if self.datatype == "mc":
                 merge_method(self.mptfiles_gensk[ipt], self.lpt_gendecmerged[ipt])

     def load_cuts(self):
@@ -709,14 +716,14 @@ def apply_cut_for_ipt(df_full, ipt: int):
             if any(self.analysis_cuts):
                 df_ipt = df_ipt.query(self.analysis_cuts[ipt]) if in_range else df_ipt
-            if any(self.analysis_mult_cuts) and self.mcordata == "data":
+            if any(self.analysis_mult_cuts) and self.datatype == "data":
                 df_ipt = df_ipt.query(self.analysis_mult_cuts[ipt]) if in_range else df_ipt
             return df_ipt

         return pd.concat(apply_cut_for_ipt(df_, ipt) for ipt in range(-1, self.p_nptfinbins + 1))

     def process_histomass(self):
-        self.logger.debug("Doing masshisto %s %s", self.mcordata, self.period)
+        self.logger.debug("Doing masshisto %s %s", self.datatype, self.period)
         self.logger.debug("Using run selection for mass histo %s %s %s", self.runlistrigger, "for period", self.period)
         if self.doml is True:
             self.logger.debug("Doing ml analysis")
@@ -735,7 +742,7 @@ def process_histomass(self):
             mergerootfiles(self.l_histomass, self.n_filemass, tmp_merged_dir)

     def process_efficiency(self):
-        print("Doing efficiencies", self.mcordata, self.period)
+        print("Doing efficiencies", self.datatype, self.period)
         print("Using run selection for eff histo", self.runlistrigger, "for period", self.period)
         if self.doml is True:
             print("Doing ml analysis")

diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py
index 5efabe2361..74bfb9b92f 100644
--- a/machine_learning_hep/processer_jet.py
+++ b/machine_learning_hep/processer_jet.py
@@ -237,7 +237,7 @@ def process_histomass_single(self, index):
             histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel)))
             if self.l_collcnt:
                 dfcollcnt = read_df(self.l_collcnt[index])
-                ser_collcnt = dfcollcnt[self.cfg(f"counter_read_{self.mcordata}")]
+                ser_collcnt = dfcollcnt[self.cfg(f"counter_read_{self.datatype}")]
                 collcnt_read = functools.reduce(lambda x, y: float(x) + float(y), (ar[0] for ar in ser_collcnt))
                 self.logger.info("sampled %g collisions", collcnt_read)
                 histonorm.SetBinContent(2, collcnt_read)
@@ -277,7 +277,7 @@ def process_histomass_single(self, index):
                 fill_hist(h, df[["fM", "fJetPt", "fPt"]], write=True)

             for sel_name, sel_spec in self.cfg("data_selections", {}).items():
-                if sel_spec["level"] == self.mcordata:
+                if sel_spec["level"] == self.datatype:
                     df_sel = dfquery(df, sel_spec["query"])
                     h = create_hist(
                         f"h_mass-ptjet-pthf_{sel_name}",
@@ -288,7 +288,7 @@ def process_histomass_single(self, index):
                     )
                     fill_hist(h, df_sel[["fM", "fJetPt", "fPt"]], write=True)

-            if self.mcordata == "mc":
+            if self.datatype == "mc":
                 df, _ = self.split_df(df, self.cfg("frac_mcana", 0.2))
                 if len(df) == 0:
                     return

diff --git a/machine_learning_hep/processerdhadrons.py b/machine_learning_hep/processerdhadrons.py
index a46e90ee37..1180ee0c2e 100755
--- a/machine_learning_hep/processerdhadrons.py
+++ b/machine_learning_hep/processerdhadrons.py
@@ -170,7 +170,7 @@ def process_histomass_single(self, index):
             myfile.cd()
             h_invmass.Write()

-            if self.mcordata == "mc":
+            if self.datatype == "mc":
                 df_sig = df[df[self.v_ismcsignal] == 1]
                 df_bkg = df[df[self.v_ismcbkg] == 1]
                 h_invmass_sig = TH1F(
@@ -188,7 +188,7 @@ def process_histomass_single(self, index):
                 h_invmass_bkg.Write()

         for sel_name, sel_spec in self.cfg("data_selections", {}).items():
-            if sel_spec["level"] == self.mcordata:
+            if sel_spec["level"] == self.datatype:
                 df_sel = dfquery(df_ptmerged, sel_spec["query"])
                 h = create_hist(
                     f"h_mass-pthf_{sel_name}",

diff --git a/machine_learning_hep/processerdhadrons_mult.py b/machine_learning_hep/processerdhadrons_mult.py
dirprefixmcapp = dp.get("prefix_dir_app", "") @@ -118,16 +125,6 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, dirpklskdecdata = [dirprefixdataapp + p for p in dp["pkl_skimmed_dec"]] dirpklskdec_mergeddata = [dirprefixdataapp + p for p in dp["pkl_skimmed_decmerged"]] - dp = data_param[case]["analysis"][typean]["data"] - dirprefixdatares = dp.get("prefix_dir_res", "") - dirresultsdata = [dirprefixdatares + os.path.expandvars(p) for p in dp["results"]] - dirresultsdatatot = dirprefixdatares + os.path.expandvars(dp["resultsallp"]) - - dp = data_param[case]["analysis"][typean]["mc"] - dirprefixmcres = dp.get("prefix_dir_res", "") - dirresultsmc = [dirprefixmcres + os.path.expandvars(p) for p in dp["results"]] - dirresultsmctot = dirprefixmcres + os.path.expandvars(dp["resultsallp"]) - binminarray = data_param[case]["ml"]["binmin"] binmaxarray = data_param[case]["ml"]["binmax"] multbkg = data_param[case]["ml"]["mult_bkg"] @@ -142,31 +139,19 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, proc_type = data_param[case]["analysis"][typean]["proc_type"] exdirs = [] - if doconversionmc: - exdirs.extend(checkdirs(dirpklmc)) - - if doconversiondata: - exdirs.extend(checkdirs(dirpkldata)) - - if doskimmingmc: - exdirs.extend(checkdirs(dirpklskmc)) - exdirs.extend(checkdirs(dirpklevtcounter_allmc)) - - if doskimmingdata: - exdirs.extend(checkdirs(dirpklskdata)) - exdirs.extend(checkdirs(dirpklevtcounter_alldata)) - - if domergingmc: - exdirs.extend(checkdirs(dirpklmlmc)) - - if domergingdata: - exdirs.extend(checkdirs(dirpklmldata)) - - if domergingperiodsmc: - exdirs.extend(checkdirs(dirpklmltotmc)) - - if domergingperiodsdata: - exdirs.extend(checkdirs(dirpklmltotdata)) + for dt in DataType: + if data_config["conversion"][dt.value]["activate"]: + exdirs.extend(checkdirs(dirpkl[dt])) + if data_config["skimming"][dt.value]["activate"]: + exdirs.extend(checkdirs(dirpklsk[dt])) + exdirs.extend(checkdirs(dirpklevtcounter_all[dt])) + if data_config["merging"][dt.value]["activate"]: + exdirs.extend(checkdirs(dirpklml[dt])) + if data_config["mergingperiods"][dt.value]["activate"]: + exdirs.extend(checkdirs(dirpklmltot[dt])) + if data_config["analysis"][dt.value]["histomass"]: + exdirs.extend(checkdirs(dirresults[dt])) + exdirs.extend(checkdirs(dirresultstot[dt])) if not docontinueapplymc: if doapplymc: @@ -182,14 +167,6 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, if domergeapplydata: exdirs.extend(checkdirs(dirpklskdec_mergeddata)) - if dohistomassmc: - exdirs.extend(checkdirs(dirresultsmc)) - exdirs.extend(checkdirs(dirresultsmctot)) - - if dohistomassdata: - exdirs.extend(checkdirs(dirresultsdata)) - exdirs.extend(checkdirs(dirresultsdatatot)) - if len(exdirs) > 0: logger.info("existing directories must be deleted") for d in exdirs: @@ -208,31 +185,21 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, sys.exit() # check and create directories - if doconversionmc: - checkmakedirlist(dirpklmc) - - if doconversiondata: - checkmakedirlist(dirpkldata) - - if doskimmingmc: - checkmakedirlist(dirpklskmc) - checkmakedir(dirpklevtcounter_allmc) - - if doskimmingdata: - checkmakedirlist(dirpklskdata) - checkmakedir(dirpklevtcounter_alldata) - - if domergingmc: - checkmakedirlist(dirpklmlmc) - - if domergingdata: - checkmakedirlist(dirpklmldata) - - if domergingperiodsmc: - checkmakedir(dirpklmltotmc) - - if domergingperiodsdata: - checkmakedir(dirpklmltotdata) + for dt in 
DataType: + if data_config["conversion"][dt.value]["activate"]: + checkmakedirlist(dirpkl[dt]) + if data_config["skimming"][dt.value]["activate"]: + checkmakedirlist(dirpklsk[dt]) + checkmakedir(dirpklevtcounter_all[dt]) + exdirs.extend(checkdirs(dirpklsk[dt])) + exdirs.extend(checkdirs(dirpklevtcounter_all[dt])) + if data_config["merging"][dt.value]["activate"]: + checkmakedirlist(dirpklml[dt]) + if data_config["mergingperiods"][dt.value]["activate"]: + checkmakedir(dirpklmltot[dt]) + if data_config["analysis"][dt.value]["histomass"]: + checkmakedirlist(dirresults[dt]) + checkmakedir(dirresultstot[dt]) if doml: checkmakedir(mlout) @@ -252,12 +219,6 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, if domergeapplydata: checkmakedirlist(dirpklskdec_mergeddata) - # Always create result directories. (produces "double free or corruption (!prev)") - checkmakedirlist(dirresultsmc) - checkmakedir(dirresultsmctot) - checkmakedirlist(dirresultsdata) - checkmakedir(dirresultsdatatot) - def mlhepmod(name): return importlib.import_module(f"..{name}", __name__) @@ -286,8 +247,12 @@ def mlhepmod(name): proc_class = mlhepmod("processer").Processer ana_class = mlhepmod("analysis.analyzer").Analyzer - mymultiprocessmc = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "mc") - mymultiprocessdata = MultiProcesser(case, proc_class, data_param[case], typean, run_param, "data") + multiprocessor = {} + for dt in DataType: + multiprocessor[dt] = MultiProcesser(case, proc_class, data_param[case], typean, run_param, dt) + mymultiprocessmc = multiprocessor[DataType.MC] + mymultiprocessdata = multiprocessor[DataType.DATA] + mymultiprocessfd = multiprocessor[DataType.FD] ana_mgr = AnalyzerManager(ana_class, data_param[case], case, typean, doanaperperiod) @@ -308,29 +273,15 @@ def mlhepmod(name): if dodownloadalice: subprocess.call("../cplusutilities/Download.sh") - if doconversionmc: - mymultiprocessmc.multi_unpack_allperiods() - - if doconversiondata: - mymultiprocessdata.multi_unpack_allperiods() - - if doskimmingmc: - mymultiprocessmc.multi_skim_allperiods() - - if doskimmingdata: - mymultiprocessdata.multi_skim_allperiods() - - if domergingmc: - mymultiprocessmc.multi_mergeml_allperiods() - - if domergingdata: - mymultiprocessdata.multi_mergeml_allperiods() - - if domergingperiodsmc: - mymultiprocessmc.multi_mergeml_allinone() - - if domergingperiodsdata: - mymultiprocessdata.multi_mergeml_allinone() + for dt in DataType: + if data_config["conversion"][dt.value]["activate"]: + multiprocessor[dt].multi_unpack_allperiods() + if data_config["skimming"][dt.value]["activate"]: + multiprocessor[dt].multi_skim_allperiods() + if data_config["merging"][dt.value]["activate"]: + multiprocessor[dt].multi_mergeml_allperiods() + if data_config["mergingperiods"][dt.value]["activate"]: + multiprocessor[dt].multi_mergeml_allinone() if doml: from machine_learning_hep.optimiser import Optimiser # pylint: disable=import-outside-toplevel @@ -392,16 +343,10 @@ def mlhepmod(name): if domergeapplymc: mymultiprocessmc.multi_mergeapply_allperiods() - if dohistomassmc: - mymultiprocessmc.multi_histomass() - if dohistomassdata: - # After-burner in case of a mult analysis to obtain "correctionsweight.root" - # for merged-period data - # pylint: disable=fixme - # FIXME Can only be run here because result directories are constructed when histomass - # is run. If this step was independent, histomass would always complain that the - # result directory already exists. 
- mymultiprocessdata.multi_histomass() + for dt in DataType: + if data_config["analysis"][dt.value]["histomass"]: + multiprocessor[dt].multi_histomass() + if doefficiency: mymultiprocessmc.multi_efficiency() analyze_steps = [] From 4d2d8940a4495a4d473a528bbebdf74f8b3bb76f Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 6 Mar 2025 14:03:09 +0100 Subject: [PATCH 04/20] Establish conversion for gen-only --- machine_learning_hep/common.py | 6 ++ .../database_ml_parameters_D0Jet_pp.yml | 74 +++++++++---------- machine_learning_hep/processer.py | 9 ++- machine_learning_hep/processer_jet.py | 4 +- 4 files changed, 50 insertions(+), 43 deletions(-) create mode 100644 machine_learning_hep/common.py diff --git a/machine_learning_hep/common.py b/machine_learning_hep/common.py new file mode 100644 index 0000000000..15ac2719b0 --- /dev/null +++ b/machine_learning_hep/common.py @@ -0,0 +1,6 @@ +from enum import Enum + +class DataType(Enum): + MC = "mc" + DATA = "data" + FD = "fd" diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 1fbd8ba83e..d53fb59a33 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -76,7 +76,7 @@ D0Jet_pp: O2hfd0pbase: [fIndexHFD0MCCOLLBASES, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] O2d0cmcpjeto: [fIndexD0CMCPJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] # O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt] - O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity, fPairJetPt, fPairJetEnergy, fPairJetTheta] + O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] extra: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated tags: @@ -104,7 +104,7 @@ D0Jet_pp: O2hfd0ml: [fMlScores] O2d0cmcdjeto: [fIndexD0CMCDJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] O2d0cmcdjetmo: [fIndexArrayD0CMCPJETOS_hf, fIndexArrayD0CMCPJETOS_geo, fIndexArrayD0CMCPJETOS_pt] - O2d0cmcdjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity, fPairPt, fPairEnergy, fPairTheta] + O2d0cmcdjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] extra: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated isd0: fFlagMcMatchRec == 1 @@ -138,7 +138,7 @@ D0Jet_pp: O2hfd0ml: [fMlScores] O2hfd0sel: [fCandidateSelFlag] O2d0cjeto: [fIndexD0CJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - O2d0cjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity, fPairPt, fPairEnergy, fPairTheta] + O2d0cjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] extra: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated extract_component: @@ -411,40 +411,40 @@ D0Jet_pp: bins_gen_fix: [10, 0., 1.] bins_det_fix: [10, 0., 1.] label: "#Delta#it{r}" - lntheta: - bins_gen_fix: [8, 1., 5.] - bins_det_fix: [8, 1., 5.] 
-        label: "#minusln(#it{#theta})"
-        arraycols: [3]
-      lnkt:
-        bins_gen_fix: [8, -4., 4.]
-        bins_det_fix: [8, -4., 4.]
-        label: "ln(#it{k}_{T}/(GeV/#it{c}))"
-        arraycols: [3]
-      lntheta-lnkt:
-        arraycols: [3, 4]
-      # new variables
-      fEnergyMother:
-        bins_gen_fix: [1, 0., 100.]
-        bins_det_fix: [1, 0., 100.]
-        arraycols: [3]
-      # lntheta-lnkt-fEnergyMother:
-      #   arraycols: [3, 4, 5]
-      fJetNConstituents:
-        bins_gen_fix: [5, 0., 20.]
-        bins_det_fix: [5, 0., 20.]
-      zpar-fJetNConstituents: {}
-      nsub21:
-        # TODO: check for 1-track jets
-        bins_gen_fix: [11, -1., 1.]
-        bins_det_fix: [11, -1., 1.]
-      eecweight:
-        # TODO: adjust binning
-        bins_gen_fix: [10, 0., 1.]
-        bins_det_fix: [10, 0., 1.]
-        arraycols: [3]
-      fPairTheta-eecweight:
-        arraycols: [3, 4]
+      # lntheta:
+      #   bins_gen_fix: [8, 1., 5.]
+      #   bins_det_fix: [8, 1., 5.]
+      #   label: "#minusln(#it{#theta})"
+      #   arraycols: [3]
+      # lnkt:
+      #   bins_gen_fix: [8, -4., 4.]
+      #   bins_det_fix: [8, -4., 4.]
+      #   label: "ln(#it{k}_{T}/(GeV/#it{c}))"
+      #   arraycols: [3]
+      # lntheta-lnkt:
+      #   arraycols: [3, 4]
+      # # new variables
+      # fEnergyMother:
+      #   bins_gen_fix: [1, 0., 100.]
+      #   bins_det_fix: [1, 0., 100.]
+      #   arraycols: [3]
+      # # lntheta-lnkt-fEnergyMother:
+      # #   arraycols: [3, 4, 5]
+      # fJetNConstituents:
+      #   bins_gen_fix: [5, 0., 20.]
+      #   bins_det_fix: [5, 0., 20.]
+      # zpar-fJetNConstituents: {}
+      # nsub21:
+      #   # TODO: check for 1-track jets
+      #   bins_gen_fix: [11, -1., 1.]
+      #   bins_det_fix: [11, -1., 1.]
+      # eecweight:
+      #   # TODO: adjust binning
+      #   bins_gen_fix: [10, 0., 1.]
+      #   bins_det_fix: [10, 0., 1.]
+      #   arraycols: [3]
+      # fPairTheta-eecweight:
+      #   arraycols: [3, 4]

     data_selections:
       mcsig:

diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py
index 192d3b0ae8..6815ef7b8f 100644
--- a/machine_learning_hep/processer.py
+++ b/machine_learning_hep/processer.py
@@ -539,14 +539,15 @@ def dfuse(df_spec):
             write_df(dfo, path)

     def skim(self, file_index):
-        dfreco = read_df(self.l_reco[file_index])
+        dfreco = read_df(self.l_reco[file_index]) if self.datatype != "fd" else None
         dfgen = read_df(self.l_gen[file_index]) if self.datatype == "mc" else None
         dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype == "mc" else None

         for ipt in range(self.p_nptbins):
-            dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
-            dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt])
-            write_df(dfrecosk, self.mptfiles_recosk[ipt][file_index])
+            if dfreco is not None:
+                dfrecosk = seldf_singlevar(dfreco, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
+                dfrecosk = dfquery(dfrecosk, self.s_reco_skim[ipt])
+                write_df(dfrecosk, self.mptfiles_recosk[ipt][file_index])

             if dfgen is not None:
                 dfgensk = seldf_singlevar(dfgen, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])

diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py
index 74bfb9b92f..221fe962d9 100644
--- a/machine_learning_hep/processer_jet.py
+++ b/machine_learning_hep/processer_jet.py
@@ -433,8 +433,8 @@ def process_efficiency_single(self, index):
                 "fNSub2",
                 "fJetNConstituents",
                 "fEnergyMother",
-                "fPairTheta",
-                "fPairPt",
+                # "fPairTheta",
+                # "fPairPt",
             ]
         )
         cols = None

From 5557f5036300216243147e35db995a3416a791ab Mon Sep 17 00:00:00 2001
From: Jochen Klein
Date: Thu, 6 Mar 2025 14:43:04 +0100
Subject: [PATCH 05/20] Establish masshisto for gen-only production

---
 .../database_ml_parameters_D0Jet_pp.yml       | 22 ++++----
 machine_learning_hep/multiprocesser.py        |  2 +-
 machine_learning_hep/processer.py             | 20 +++++---
 machine_learning_hep/processer_jet.py         | 10 +++-
 machine_learning_hep/steer_analysis.py        | 50 +++++++++----------
 machine_learning_hep/submission/processor.yml |  8 +--
 6 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
index d53fb59a33..2971c24933 100644
--- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
+++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml
@@ -79,6 +79,7 @@ D0Jet_pp:
           O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity]
         extra:
           fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated
+          fM: 1.864
         tags:
           ismcsignal: {var: fFlagMcMatchGen, req: 1, abs: true, level: mc}
           ismcbkg: {var: ismcsignal, req: 0, level: mc}
@@ -161,13 +162,14 @@ D0Jet_pp:
       jetdata:
         level: data
         file: AnalysisResultsReco.parquet
-      # evtorig:
-      #   level: all
-      #   file: AnalysisResultsEvtOrig.parquet
-      # evt:
-      #   level: all
-      #   source: evtorig
-      #   file: AnalysisResultsEvt.parquet
+      evtorig:
+        level: all
+        source: collgen
+        file: AnalysisResultsEvtOrig.parquet
+      evt:
+        level: all
+        source: collgen
+        file: AnalysisResultsEvt.parquet
       # collcnt:
       #   level: all
      #   file: AnalysisResultsCollCnt.parquet
       # bccnt:
       #   level: all
       #   file: AnalysisResultsBcCnt.parquet
@@ -221,10 +223,10 @@ D0Jet_pp:

   files_names:
     namefile_unmerged_tree: AO2D.root
-    namefile_reco: AnalysisResultsReco.parquet
+    # namefile_reco: AnalysisResultsReco.parquet
     namefile_evt: AnalysisResultsEvt.parquet
-    namefile_collcnt: AnalysisResultsCollCnt.parquet
-    namefile_bccnt: AnalysisResultsBcCnt.parquet
+    # namefile_collcnt: AnalysisResultsCollCnt.parquet
+    # namefile_bccnt: AnalysisResultsBcCnt.parquet
     namefile_evtvalroot: AnalysisResultsROOTEvtVal.root
     namefile_evtorig: AnalysisResultsEvtOrig.parquet
     namefile_gen: AnalysisResultsGen.parquet

diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py
index ccb4fa3c06..034431b43a 100755
--- a/machine_learning_hep/multiprocesser.py
+++ b/machine_learning_hep/multiprocesser.py
@@ -80,7 +80,7 @@ def __init__(self, case, proc_class, datap, typean, run_param, datatype):

         # namefiles pkl
         self.v_var_binning = datap["var_binning"]
-        self.n_reco = datap["files_names"]["namefile_reco"]
+        self.n_reco = datap["files_names"].get("namefile_reco", "")
         self.n_evt = datap["files_names"]["namefile_evt"]
         self.n_evtorig = datap["files_names"]["namefile_evtorig"]
         self.n_evt_count_ml = datap["files_names"].get("namefile_evt_count", "evtcount.yaml")

diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py
index 6815ef7b8f..5a68ad1143 100644
--- a/machine_learning_hep/processer.py
+++ b/machine_learning_hep/processer.py
@@ -138,9 +138,9 @@ def __init__(
         # def nget(d : dict, k : list, dd = None):
         #     return nget(d.get(k.pop(0), {}), k, dd) if len(k) > 1 else d.get(k.pop(0), dd)
         # nget(datap, ['dfs', 'write', 'jetsubdet', 'file'])
-        self.n_reco = datap["files_names"]["namefile_reco"]
+        self.n_reco = datap["files_names"].get("namefile_reco", "")
         self.n_evt = datap["files_names"]["namefile_evt"]
-        self.n_collcnt = datap["files_names"]["namefile_collcnt"]
+        self.n_collcnt = datap["files_names"].get("namefile_collcnt")
         self.n_bccnt = datap["files_names"].get("namefile_bccnt")
         self.n_evtorig = datap["files_names"].get("namefile_evtorig")
self.datatype != 'fd': + df = pd.concat(read_df(self.mptfiles_recosk[bin][index]) for bin in self.active_bins_skim) + else: + df = pd.concat(read_df(self.mptfiles_gensk[bin][index]) for bin in self.active_bins_skim) # remove entries outside of kinematic range (should be taken care of by projections in analyzer) df = df.loc[(df.fJetPt >= min(self.binarray_ptjet)) & (df.fJetPt < max(self.binarray_ptjet))] df = df.loc[(df.fPt >= min(self.bins_analysis[:, 0])) & (df.fPt < max(self.bins_analysis[:, 1]))] # Custom skimming cuts - df = self.apply_cuts_all_ptbins(df) + if self.datatype != "fd": + df = self.apply_cuts_all_ptbins(df) if col_evtidx := self.cfg("cand_collidx"): h = create_hist("h_ncand", ";N_{cand}", 20, 0.0, 20.0) diff --git a/machine_learning_hep/steer_analysis.py b/machine_learning_hep/steer_analysis.py index fe8d0e3e3e..05f2ce276b 100644 --- a/machine_learning_hep/steer_analysis.py +++ b/machine_learning_hep/steer_analysis.py @@ -15,11 +15,13 @@ """ import argparse +from functools import reduce import importlib import os import shutil import subprocess import sys +from typing import TypeVar # unclear why shap needs to be imported from here, # segfaults when imported from within other modules @@ -33,6 +35,14 @@ from .utilities_files import checkdirs, checkmakedir, checkmakedirlist, delete_dirlist +T = TypeVar("T") +def cfg(config: dict, param: str, default: T = None) -> T: + return reduce( + lambda d, key: d.get(key, default) if isinstance(d, dict) else default, + param.split("."), + config, + ) + def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, too-many-branches data_config: dict, data_param: dict, @@ -51,14 +61,6 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, update_config(data_param, data_config, data_param_overwrite) dodownloadalice = data_config["download"]["alice"]["activate"] - doconversionmc = data_config["conversion"]["mc"]["activate"] - doconversiondata = data_config["conversion"]["data"]["activate"] - domergingmc = data_config["merging"]["mc"]["activate"] - domergingdata = data_config["merging"]["data"]["activate"] - doskimmingmc = data_config["skimming"]["mc"]["activate"] - doskimmingdata = data_config["skimming"]["data"]["activate"] - domergingperiodsmc = data_config["mergingperiods"]["mc"]["activate"] - domergingperiodsdata = data_config["mergingperiods"]["data"]["activate"] doml = data_config["ml_study"]["activate"] docorrelation = data_config["ml_study"]["docorrelation"] dotraining = data_config["ml_study"]["dotraining"] @@ -83,8 +85,6 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, domergeapplymc = data_config["mlapplication"]["mc"]["domergeapply"] docontinueapplydata = data_config["mlapplication"]["data"]["docontinueafterstop"] docontinueapplymc = data_config["mlapplication"]["mc"]["docontinueafterstop"] - dohistomassmc = data_config["analysis"]["mc"]["histomass"] - dohistomassdata = data_config["analysis"]["data"]["histomass"] doefficiency = data_config["analysis"]["mc"]["efficiency"] efficiency_resp = data_config["analysis"]["mc"].get("efficiency_resp", False) do_syst_ml = data_config["systematics"]["cutvar"]["activate"] @@ -140,16 +140,16 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, exdirs = [] for dt in DataType: - if data_config["conversion"][dt.value]["activate"]: + if cfg(data_config, f"conversion.{dt.value}.activate"): exdirs.extend(checkdirs(dirpkl[dt])) - if data_config["skimming"][dt.value]["activate"]: + if 
cfg(data_config, f"skimming.{dt.value}.activate"): exdirs.extend(checkdirs(dirpklsk[dt])) exdirs.extend(checkdirs(dirpklevtcounter_all[dt])) - if data_config["merging"][dt.value]["activate"]: + if cfg(data_config, f"merging.{dt.value}.activate"): exdirs.extend(checkdirs(dirpklml[dt])) - if data_config["mergingperiods"][dt.value]["activate"]: + if cfg(data_config, f"mergingperiods.{dt.value}.activate"): exdirs.extend(checkdirs(dirpklmltot[dt])) - if data_config["analysis"][dt.value]["histomass"]: + if cfg(data_config, f"analysis.{dt.value}.histomass"): exdirs.extend(checkdirs(dirresults[dt])) exdirs.extend(checkdirs(dirresultstot[dt])) @@ -186,18 +186,18 @@ def do_entire_analysis( # pylint: disable=too-many-locals, too-many-statements, # check and create directories for dt in DataType: - if data_config["conversion"][dt.value]["activate"]: + if cfg(data_config, f"conversion.{dt.value}.activate"): checkmakedirlist(dirpkl[dt]) - if data_config["skimming"][dt.value]["activate"]: + if cfg(data_config, f"skimming.{dt.value}.activate"): checkmakedirlist(dirpklsk[dt]) checkmakedir(dirpklevtcounter_all[dt]) exdirs.extend(checkdirs(dirpklsk[dt])) exdirs.extend(checkdirs(dirpklevtcounter_all[dt])) - if data_config["merging"][dt.value]["activate"]: + if cfg(data_config, f"merging.{dt.value}.activate"): checkmakedirlist(dirpklml[dt]) - if data_config["mergingperiods"][dt.value]["activate"]: + if cfg(data_config, f"mergingperiods.{dt.value}.activate"): checkmakedir(dirpklmltot[dt]) - if data_config["analysis"][dt.value]["histomass"]: + if cfg(data_config, f"analysis.{dt.value}.histomass"): checkmakedirlist(dirresults[dt]) checkmakedir(dirresultstot[dt]) @@ -274,13 +274,13 @@ def mlhepmod(name): subprocess.call("../cplusutilities/Download.sh") for dt in DataType: - if data_config["conversion"][dt.value]["activate"]: + if cfg(data_config, f"conversion.{dt.value}.activate"): multiprocessor[dt].multi_unpack_allperiods() - if data_config["skimming"][dt.value]["activate"]: + if cfg(data_config, f"skimming.{dt.value}.activate"): multiprocessor[dt].multi_skim_allperiods() - if data_config["merging"][dt.value]["activate"]: + if cfg(data_config, f"merging.{dt.value}.activate"): multiprocessor[dt].multi_mergeml_allperiods() - if data_config["mergingperiods"][dt.value]["activate"]: + if cfg(data_config, f"mergingperiods.{dt.value}.activate"): multiprocessor[dt].multi_mergeml_allinone() if doml: @@ -344,7 +344,7 @@ def mlhepmod(name): mymultiprocessmc.multi_mergeapply_allperiods() for dt in DataType: - if data_config["analysis"][dt.value]["histomass"]: + if cfg(data_config, f"analysis.{dt.value}.histomass"): multiprocessor[dt].multi_histomass() if doefficiency: diff --git a/machine_learning_hep/submission/processor.yml b/machine_learning_hep/submission/processor.yml index df4e00a242..b9d919d65d 100644 --- a/machine_learning_hep/submission/processor.yml +++ b/machine_learning_hep/submission/processor.yml @@ -60,10 +60,12 @@ analysis: # Do only merged (false) doperperiod: false data: - histomass: true # processer: process_histomass + histomass: false # processer: process_histomass mc: - histomass: true # processer: process_histomass - efficiency: true # processer: process_efficiency + histomass: false # processer: process_histomass + efficiency: false # processer: process_efficiency + fd: + histomass: true steps: systematics: From ed820d10a9d70f04dbcfddcb352bba852d4d5490 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 6 Mar 2025 17:23:08 +0100 Subject: [PATCH 06/20] Update --- 
machine_learning_hep/processer_jet.py | 13 +++++++++---- machine_learning_hep/submission/processor.yml | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 153e6d875e..b9eb660f24 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -161,7 +161,7 @@ def _verify_variables(self, dfi): print(dfi[~mask][var], flush=True) def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name - self.logger.info("calculating variables") + self.logger.info("calculating variables for DF %s", df.info()) observables = self.cfg("observables", {}) if len(df) == 0: for obs in observables: @@ -231,7 +231,8 @@ def split_df(self, dfi, frac): def process_histomass_single(self, index): self.logger.info("Processing (histomass) %s", self.l_evtorig[index]) - with TFile.Open(self.l_histomass[index], "recreate") as _: + print(f"Opening file {self.l_histomass[index]}", flush=True) + with TFile.Open(self.l_histomass[index], "recreate") as rfile: dfevtorig = read_df(self.l_evtorig[index]) histonorm = TH1F("histonorm", "histonorm", 4, 0, 4) histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel))) @@ -255,8 +256,6 @@ def process_histomass_single(self, index): get_axis(histonorm, 0).SetBinLabel(4, "N_{BC}^{TVX}") histonorm.Write() - print(f"{self.mptfiles_recosk=}", flush=True) - print(f"{self.mptfiles_gensk=}", flush=True) if self.datatype != 'fd': df = pd.concat(read_df(self.mptfiles_recosk[bin][index]) for bin in self.active_bins_skim) else: @@ -280,7 +279,12 @@ def process_histomass_single(self, index): self.binarray_ptjet, self.binarray_pthf, ) + print(f"Filling and writing histogram from {df.head()}", flush=True) fill_hist(h, df[["fM", "fJetPt", "fPt"]], write=True) + h.Write() + print("WRITING!!!!!!!!!!!!!!!") + rfile.WriteObject(h) + # h.Print("all") for sel_name, sel_spec in self.cfg("data_selections", {}).items(): if sel_spec["level"] == self.datatype: @@ -308,6 +312,7 @@ def process_histomass_single(self, index): df["idx_match"] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) dfquery(df, "idx_match >= 0", inplace=True) + print(f"calculation", flush=True) self._calculate_variables(df) for obs, spec in self.cfg("observables", {}).items(): diff --git a/machine_learning_hep/submission/processor.yml b/machine_learning_hep/submission/processor.yml index b9d919d65d..bde1f8dd7c 100644 --- a/machine_learning_hep/submission/processor.yml +++ b/machine_learning_hep/submission/processor.yml @@ -60,9 +60,9 @@ analysis: # Do only merged (false) doperperiod: false data: - histomass: false # processer: process_histomass + histomass: true # processer: process_histomass mc: - histomass: false # processer: process_histomass + histomass: true # processer: process_histomass efficiency: false # processer: process_efficiency fd: histomass: true From 8d61e99fd62cafc6caf8f2f4df67f9b2d8a28fe2 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 6 Mar 2025 17:43:06 +0100 Subject: [PATCH 07/20] Run all conversions --- .../database_ml_parameters_D0Jet_pp.yml | 106 +++++++++++++----- machine_learning_hep/processer.py | 3 +- 2 files changed, 79 insertions(+), 30 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 2971c24933..cb23d56340 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ 
b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -51,26 +51,53 @@ D0Jet_pp: #region dfs dfs: read: - # evtorig: - # index: fIndexHfD0McCollBases - # trees: - # O2hfd0collbase: [fNumContrib, fPosZ] - # filter: "abs(fPosZ) < 10." - # collcnt: - # trees: - # O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC] - # bccnt: - # trees: - # O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB] + evtorig: + level: data + index: fIndexHfD0McCollBases + trees: + O2hfd0collbase: [fNumContrib, fPosZ] + filter: "abs(fPosZ) < 10." + collcnt: + level: data + trees: + O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC] + bccnt: + level: data + trees: + O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB] collgen: # TODO: check if we can use the HF collision table instead - level: gen + level: mc index: fIndexD0CMCPJETCOS trees: O2d0cmcpjetco: [fPosZ, fCentrality, fEventSel] filter: "abs(fPosZ) < 10." jetgen: - level: gen + level: mc + index: fIndexD0CMCPJETOS + trees: + O2hfd0pbase: [fIndexHfD0McCollBases, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] + O2d0cmcpjeto: [fIndexD0CMCPJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] + # O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt] + O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] + extra: + fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated + fM: 1.864 + tags: + ismcsignal: {var: fFlagMcMatchGen, req: [[0], []], abs: true} + ismcbkg: {var: fFlagMcMatchGen, req: [[], [0]], abs: true} + ismcprompt: {var: fOriginMcGen, req: [[0], [1]]} + ismcfd: {var: fOriginMcGen, req: [[1], [0]]} + filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut + + collfd: # TODO: check if we can use the HF collision table instead + level: fd + index: fIndexD0CMCPJETCOS + trees: + O2d0cmcpjetco: [fPosZ, fCentrality, fEventSel] + filter: "abs(fPosZ) < 10." + jetfd: + level: fd index: fIndexD0CMCPJETOS trees: O2hfd0pbase: [fIndexHFD0MCCOLLBASES, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] @@ -88,13 +115,13 @@ D0Jet_pp: filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut colldet: - level: det + level: mc index: fIndexD0CMCDJETCOS trees: O2d0cmcdjetco: [fPosZ, fCentrality, fEventSel] filter: "abs(fPosZ) < 10." jetdet: - level: det + level: mc index: fIndexD0CMCDJETOS trees: # add EEC columns O2hfd0base: [fIndexHfD0CollBases, fPt, fEta, fPhi, fM] @@ -146,30 +173,53 @@ D0Jet_pp: - {var: fMlScores, newvar: mlBkgScore, component: 0} filter: "fPt >= 1. 
and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut - # merge: - # - {base: jetgen, ref: evtorig} - # - {base: jetdet, ref: colldet} - # - {base: jetdata, ref: colldata} + merge: + - {base: jetgen, ref: collgen} + - {base: jetfd, ref: collfd} + - {base: jetdet, ref: colldet} + - {base: jetdata, ref: colldata} # workaround for yamlfmt issue #110 write: jetgen: - level: gen + level: mc + file: AnalysisResultsGen.parquet + jetfd: + level: fd file: AnalysisResultsGen.parquet jetdet: - level: det + level: mc file: AnalysisResultsReco.parquet jetdata: level: data file: AnalysisResultsReco.parquet - evtorig: - level: all + + evtorig_mc: + level: mc source: collgen file: AnalysisResultsEvtOrig.parquet - evt: - level: all + evt_mc: + level: mc source: collgen file: AnalysisResultsEvt.parquet + + evtorig_data: + level: data + source: evtorig + file: AnalysisResultsEvtOrig.parquet + evt_data: + level: data + source: evtorig + file: AnalysisResultsEvt.parquet + + evtorig_fd: + level: fd + source: collfd + file: AnalysisResultsEvtOrig.parquet + evt_fd: + level: fd + source: collfd + file: AnalysisResultsEvt.parquet # collcnt: # level: all # file: AnalysisResultsCollCnt.parquet @@ -223,10 +273,10 @@ D0Jet_pp: files_names: namefile_unmerged_tree: AO2D.root - # namefile_reco: AnalysisResultsReco.parquet + namefile_reco: AnalysisResultsReco.parquet namefile_evt: AnalysisResultsEvt.parquet - # namefile_collcnt: AnalysisResultsCollCnt.parquet - # namefile_bccnt: AnalysisResultsBcCnt.parquet + namefile_collcnt: AnalysisResultsCollCnt.parquet + namefile_bccnt: AnalysisResultsBcCnt.parquet namefile_evtvalroot: AnalysisResultsROOTEvtVal.root namefile_evtorig: AnalysisResultsEvtOrig.parquet namefile_gen: AnalysisResultsGen.parquet diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 5a68ad1143..b33b937ac4 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -419,7 +419,7 @@ def dfuse(df_spec): return ( (level == "all") or (level in ("mc", "gen", "det") and self.datatype == "mc") - or (level in ("mc", "gen") and self.datatype == "fd") + or (level in ("fd", "gen") and self.datatype == "fd") or (level in ("data") and self.datatype == "data") ) @@ -540,7 +540,6 @@ def dfuse(df_spec): write_df(dfo, path) def skim(self, file_index): - print("//////////////// skimming ///////////") dfreco = read_df(self.l_reco[file_index]) if self.datatype != "fd" else None dfgen = read_df(self.l_gen[file_index]) if self.datatype in ('mc', 'fd') else None dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype in ('mc', 'fd') else None From 77133a1c15afec0e4596e026b9e188a343781c06 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Fri, 29 Aug 2025 10:20:52 +0200 Subject: [PATCH 08/20] Fix --- .../data/data_run3/database_ml_parameters_D0Jet_pp.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index cb23d56340..1eeca8139d 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -84,10 +84,10 @@ D0Jet_pp: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated fM: 1.864 tags: - ismcsignal: {var: fFlagMcMatchGen, req: [[0], []], abs: true} - 
ismcbkg: {var: fFlagMcMatchGen, req: [[], [0]], abs: true} - ismcprompt: {var: fOriginMcGen, req: [[0], [1]]} - ismcfd: {var: fOriginMcGen, req: [[1], [0]]} + ismcsignal: {var: fFlagMcMatchGen, req: 1, abs: true, level: mc} + ismcbkg: {var: ismcsignal, req: 0, level: mc} + ismcprompt: {var: fOriginMcGen, req: 1, level: mc} + ismcfd: {var: fOriginMcGen, req: 2, level: mc} filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut collfd: # TODO: check if we can use the HF collision table instead From 2c80e4f38ecee0f2ef6403b7737b47d4ffd01ff8 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 6 Mar 2025 18:01:38 +0100 Subject: [PATCH 09/20] Run processor --- .../database_ml_parameters_D0Jet_pp.yml | 26 ++++++++++++++----- machine_learning_hep/processer.py | 4 +-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 1eeca8139d..2d6db3eb08 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -57,6 +57,14 @@ D0Jet_pp: trees: O2hfd0collbase: [fNumContrib, fPosZ] filter: "abs(fPosZ) < 10." + collcnt_mc: + level: mc + trees: + O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC] + bccnt_mc: + level: mc + trees: + O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB] collcnt: level: data trees: @@ -220,12 +228,18 @@ D0Jet_pp: level: fd source: collfd file: AnalysisResultsEvt.parquet - # collcnt: - # level: all - # file: AnalysisResultsCollCnt.parquet - # bccnt: - # level: all - # file: AnalysisResultsBcCnt.parquet + collcnt_mc: + level: mc + file: AnalysisResultsCollCnt.parquet + bccnt_mc: + level: mc + file: AnalysisResultsBcCnt.parquet + collcnt: + level: data + file: AnalysisResultsCollCnt.parquet + bccnt: + level: data + file: AnalysisResultsBcCnt.parquet variables: var_all: [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPiExpPi, fNSigTpcKaExpPi, fNSigTpcPiExpKa, fNSigTpcKaExpKa] diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index b33b937ac4..d743798cdd 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -194,8 +194,8 @@ def __init__( self.l_gen = createlist(self.d_pkl, self.l_path, self.n_gen) self.l_evt = createlist(self.d_pkl, self.l_path, self.n_evt) self.l_evtorig = createlist(self.d_pkl, self.l_path, self.n_evtorig) - self.l_collcnt = createlist(self.d_pkl, self.l_path, self.n_collcnt) - self.l_bccnt = createlist(self.d_pkl, self.l_path, self.n_bccnt) + self.l_collcnt = createlist(self.d_pkl, self.l_path, self.n_collcnt) if self.datatype != "fd" else None + self.l_bccnt = createlist(self.d_pkl, self.l_path, self.n_bccnt) if self.datatype != "fd" else None self.l_histomass = createlist(self.d_results, self.l_path, self.n_filemass) self.l_histoeff = createlist(self.d_results, self.l_path, self.n_fileeff) # self.l_historesp = createlist(self.d_results, self.l_path, self.n_fileresp) From 0b2f9817ba54c8ae1d8bd5ac4bc4c0c894e26ba1 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 11 Mar 2025 10:29:14 +0100 Subject: [PATCH 10/20] Add submission 
for feeddown simulation --- machine_learning_hep/submission/fd.yml | 85 ++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 machine_learning_hep/submission/fd.yml diff --git a/machine_learning_hep/submission/fd.yml b/machine_learning_hep/submission/fd.yml new file mode 100644 index 0000000000..3349a16fd6 --- /dev/null +++ b/machine_learning_hep/submission/fd.yml @@ -0,0 +1,85 @@ +--- +case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis +download: + alice: + activate: false +conversion: # pkl + mc: + activate: false + fd: + activate: true + data: + activate: false +skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all + mc: + activate: false + fd: + activate: true + data: + activate: false +merging: # pkl_skimmed_merge_for_ml (pklskml) + mc: + activate: false + fd: + activate: false + data: + activate: false +mergingperiods: # pkl_skimmed_merge_for_ml_all + mc: + activate: false + fd: + activate: false + data: + activate: false + +ml_study: # mlout, mlplot + activate: false + dotraining: false + dotesting: false + doplotdistr: false + doroc: false + doroctraintest: false + doimportance: false + doimportanceshap: false + docorrelation: false + dolearningcurve: false + doapplytodatamc: false + doscancuts: false + doefficiency: false + dosignifopt: false + doboundary: false + docrossvalidation: false + dogridsearch: false + dobayesianopt: false + +mlapplication: + data: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + mc: + doapply: false # pkl_skimmed_dec (pklskdec) + domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged) + docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten) + +analysis: + type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana + # Do each period separately including merged (true) + # Do only merged (false) + doperperiod: false + data: + histomass: false # processer: process_histomass + fd: + histomass: false + mc: + histomass: false # processer: process_histomass + efficiency: false # processer: process_efficiency + steps: + +systematics: + cutvar: + activate: false + do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials + resume: false # already done mass and efficiency histograms will not be done again, continue with left trials + mcptshape: + activate: false From 96b4a758d497fc3da77970c938841128d7f03d1e Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 11 Mar 2025 13:25:57 +0100 Subject: [PATCH 11/20] Extend jet analyzer to read feeddown info from processor --- .../analysis/analyzer_jets.py | 111 ++++++++++-------- .../database_ml_parameters_D0Jet_pp.yml | 6 +- machine_learning_hep/submission/processor.yml | 2 +- 3 files changed, 69 insertions(+), 50 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index ba4b1da86a..941957e327 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -65,15 +65,18 @@ def __init__(self, datap, case, typean, period): suffix = f"results.{period}" if period is not None else "resultsallp" self.d_resultsallpmc = self.cfg(f"mc.{suffix}") self.d_resultsallpdata = 
self.cfg(f"data.{suffix}") + self.d_resultsallpfd = self.cfg(f"fd.{suffix}") # input directories (processor output) self.d_resultsallpdata_proc = self.cfg(f"data_proc.{suffix}") self.d_resultsallpmc_proc = self.cfg(f"mc_proc.{suffix}") + self.d_resultsallpfd_proc = self.cfg(f"fd_proc.{suffix}") # input files n_filemass_name = datap["files_names"]["histofilename"] self.n_filemass = os.path.join(self.d_resultsallpdata_proc, n_filemass_name) self.n_filemass_mc = os.path.join(self.d_resultsallpmc_proc, n_filemass_name) + self.n_filemass_fd = os.path.join(self.d_resultsallpfd_proc, n_filemass_name) self.n_fileeff = datap["files_names"]["efffilename"] self.n_fileeff = os.path.join(self.d_resultsallpmc_proc, self.n_fileeff) self.n_fileresp = datap["files_names"]["respfilename"] @@ -1063,57 +1066,73 @@ def _extract_signal(self, hist, var, mcordata, ipt): # hres.Sumw2() # TODO: check if we should do this here return hres - # region feeddown - # pylint: disable=too-many-statements - def estimate_feeddown(self): - self.logger.info("Estimating feeddown") - with TFile(self.cfg("fd_root")) as rfile: - powheg_xsection = rfile.Get("fHistXsection") - powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries() - self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor) + def estimate_feeddown(self): + """Estimate feeddown from legacy Run 2 trees or gen-only simulation""" + match self.cfg("fd_input", "tree"): + case "tree": + with TFile(self.cfg("fd_root")) as rfile: + powheg_xsection = rfile.Get("fHistXsection") + powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries() + self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor) + + df = pd.read_parquet(self.cfg("fd_parquet")) + col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping + + # TODO: generalize to higher dimensions + h3_fd_gen_orig = {} + for var in self.observables["all"]: + bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d") + # TODO: generalize or derive from histogram? 
+ bins_obs = {} + if binning := self.cfg(f"observables.{var}.bins_gen_var"): + bins_tmp = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{var}.bins_gen_fix"): + bins_tmp = bin_array(*binning) + elif binning := self.cfg(f"observables.{var}.bins_var"): + bins_tmp = np.asarray(binning, "d") + elif binning := self.cfg(f"observables.{var}.bins_fix"): + bins_tmp = bin_array(*binning) + else: + self.logger.error("no binning specified for %s, using defaults", var) + bins_tmp = bin_array(10, 0.0, 1.0) + bins_obs[var] = bins_tmp + + colname = col_mapping.get(var, f"{var}_jet") + if f"{colname}" not in df: + if var is not None: + self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname) + # print(df.info(), flush=True) + continue - df = pd.read_parquet(self.cfg("fd_parquet")) - col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping + # TODO: derive histogram + h3_fd_gen_orig[var] = create_hist( + "h3_feeddown_gen", + f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}", + bins_ptjet, + self.bins_candpt, + bins_obs[var], + ) + fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]]) + self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") + + case "sim": + # TODO: recover cross section + h3_fd_gen_orig = {} + with TFile(self.n_filemass_fd) as rfile: + for var in self.observables["all"]: + self.logger.info("Running feeddown analysis for obs. %s", var) + label = f"-{var}" if var else "" + if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"): + h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {}) + ensure_sumw2(h3_fd_gen_orig[var]) + + case fd_input: + self.logger.critical("Invalid feeddown input %s", fd_input) - # TODO: generalize to higher dimensions for var in self.observables["all"]: - bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d") - # TODO: generalize or derive from histogram? 
- bins_obs = {} - if binning := self.cfg(f"observables.{var}.bins_gen_var"): - bins_tmp = np.asarray(binning, "d") - elif binning := self.cfg(f"observables.{var}.bins_gen_fix"): - bins_tmp = bin_array(*binning) - elif binning := self.cfg(f"observables.{var}.bins_var"): - bins_tmp = np.asarray(binning, "d") - elif binning := self.cfg(f"observables.{var}.bins_fix"): - bins_tmp = bin_array(*binning) - else: - self.logger.error("no binning specified for %s, using defaults", var) - bins_tmp = bin_array(10, 0.0, 1.0) - bins_obs[var] = bins_tmp - - colname = col_mapping.get(var, f"{var}_jet") - if f"{colname}" not in df: - if var is not None: - self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname) - # print(df.info(), flush=True) - continue - - # TODO: derive histogram - h3_fd_gen_orig = create_hist( - "h3_feeddown_gen", - f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}", - bins_ptjet, - self.bins_candpt, - bins_obs[var], - ) - fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]]) - self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") - # new method - h3_fd_gen = h3_fd_gen_orig.Clone() + h3_fd_gen = h3_fd_gen_orig[var].Clone() ensure_sumw2(h3_fd_gen) self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen.png") # apply np efficiency diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 2d6db3eb08..b23ac85b40 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -308,7 +308,7 @@ D0Jet_pp: multi: data: nprocessesparallel: 80 - maxfiles: [1] #list of periods + maxfiles: [-1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [100] #list of periods fracmerge: [.1] #list of periods @@ -325,7 +325,7 @@ D0Jet_pp: mcreweights: [../Analyses] #list of periods mc: nprocessesparallel: 80 - maxfiles: [1] #list of periods + maxfiles: [-1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [1000] #list of periods fracmerge: [1.] #list of periods @@ -342,7 +342,7 @@ D0Jet_pp: mcreweights: [../Analyses] #list of periods fd: nprocessesparallel: 80 - maxfiles: [1] #list of periods + maxfiles: [-1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [1000] #list of periods fracmerge: [1.] 
#list of periods diff --git a/machine_learning_hep/submission/processor.yml b/machine_learning_hep/submission/processor.yml index bde1f8dd7c..de70f15aa2 100644 --- a/machine_learning_hep/submission/processor.yml +++ b/machine_learning_hep/submission/processor.yml @@ -63,7 +63,7 @@ analysis: histomass: true # processer: process_histomass mc: histomass: true # processer: process_histomass - efficiency: false # processer: process_efficiency + efficiency: true # processer: process_efficiency fd: histomass: true steps: From 51102acb94679c6ba3ed4eb6fb20bc100df532fe Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 18 Mar 2025 11:33:10 +0100 Subject: [PATCH 12/20] Test with latest Lc trains --- .../database_ml_parameters_LcJet_pp.yml | 498 ++++++++++++++++-- 1 file changed, 441 insertions(+), 57 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml index 322414792f..3e33ac4e02 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml @@ -19,8 +19,32 @@ LcJet_pp: sel_cen_unp: null sel_good_evt_unp: null # "fIsEventReject == 0" # sel_reco_skim: ["mlPromptScore > 0.96", "mlPromptScore > 0.97", "mlPromptScore > 0.9", "mlPromptScore > 0.85", "mlPromptScore > 0.8", "mlPromptScore > 0.6", null] # (sel_skim_binmin bins) - sel_reco_skim: [null, null, null, null, null, null, null, null, null, null, null] # (sel_skim_binmin bins) - sel_gen_skim: [null, null, null, null, null, null, null, null, null, null, null] # (sel_skim_binmin bins) + sel_reco_skim: [ + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + ] # (sel_skim_binmin bins) + sel_gen_skim: [ + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + ] # (sel_skim_binmin bins) sel_skim_binmin: [1, 2, 3, 4, 5, 6, 7, 8, 10, 12] # skimming pt bins (sel_skim_binmin bins) sel_skim_binmax: [2, 3, 4, 5, 6, 7, 8, 10, 12, 30] # skimming pt bins (sel_skim_binmin bins) var_binning: fPt @@ -57,10 +81,21 @@ LcJet_pp: fIsEventReject: 0 collcnt: trees: - O2collcount: [fReadCounts, fReadCountsWithTVX, fReadCountsWithTVXAndZVertexAndSel8, fReadCountsWithTVXAndZVertexAndSelMC] + O2collcount: + [ + fReadCounts, + fReadCountsWithTVX, + fReadCountsWithTVXAndZVertexAndSel8, + fReadCountsWithTVXAndZVertexAndSelMC, + ] bccnt: trees: - O2bccount: [fReadCountsWithTVX, fReadCountsWithTVXAndNoTFB, fReadCountsWithTVXAndNoTFBAndNoITSROFB] + O2bccount: + [ + fReadCountsWithTVX, + fReadCountsWithTVXAndNoTFB, + fReadCountsWithTVXAndNoTFBAndNoITSROFB, + ] # collgen: # level: gen @@ -72,15 +107,41 @@ LcJet_pp: index: fIndexLCMCPJETOS trees: O2hflcpbase: [fPt, fY, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] - O2lccmcpjeto: [fIndexLCCMCPJETCOS, fIndexHFLCPBASES_0, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - O2lccmcpjetmo: [fIndexArrayLCCMCDJETOS_hf, fIndexArrayLCCMCDJETOS_geo, fIndexArrayLCCMCDJETOS_pt] - O2lccmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fPairTheta, fPairPt, fPairEnergy] + O2lccmcpjeto: + [ + fIndexLCCMCPJETCOS, + fIndexHFLCPBASES_0, + fJetPt, + fJetPhi, + fJetEta, + fJetNConstituents, + fJetR, + ] + O2lccmcpjetmo: + [ + fIndexArrayLCCMCDJETOS_hf, + fIndexArrayLCCMCDJETOS_geo, + fIndexArrayLCCMCDJETOS_pt, + ] + O2lccmcpjetsso: + [ + fEnergyMother, + fPtLeading, + fPtSubLeading, + fTheta, + fNSub2DR, + fNSub1, + fNSub2, + 
fPairTheta, + fPairPt, + fPairEnergy, + ] tags: - isstd: {var: fFlagMcMatchGen, req: [[1], []]} - ismcsignal: {var: fFlagMcMatchGen, req: [[1], []], abs: true} - ismcbkg: {var: fFlagMcMatchGen, req: [[], [1]], abs: true} - ismcprompt: {var: fOriginMcGen, req: [[0], []]} - ismcfd: {var: fOriginMcGen, req: [[1], []]} + isstd: { var: fFlagMcMatchGen, req: [[1], []] } + ismcsignal: { var: fFlagMcMatchGen, req: [[1], []], abs: true } + ismcbkg: { var: fFlagMcMatchGen, req: [[], [1]], abs: true } + ismcprompt: { var: fOriginMcGen, req: [[0], []] } + ismcfd: { var: fOriginMcGen, req: [[1], []] } filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut colldet: @@ -98,18 +159,44 @@ LcJet_pp: # O2hflcpare: [fErrorDecayLength, fErrorDecayLengthXY, fErrorImpactParameter0, fErrorImpactParameter1] O2hflcsel: [fCandidateSelFlag] O2hflcml: [fMlScores] - O2lccmcdjeto: [fIndexLCCMCDJETCOS, fIndexHFLCBASES_0, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - O2lccmcdjetmo: [fIndexArrayLCCMCPJETOS_hf, fIndexArrayLCCMCPJETOS_geo, fIndexArrayLCCMCPJETOS_pt] - O2lccmcdjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fPairTheta, fPairPt, fPairEnergy] + O2lccmcdjeto: + [ + fIndexLCCMCDJETCOS, + fIndexHFLCBASES_0, + fJetPt, + fJetPhi, + fJetEta, + fJetNConstituents, + fJetR, + ] + O2lccmcdjetmo: + [ + fIndexArrayLCCMCPJETOS_hf, + fIndexArrayLCCMCPJETOS_geo, + fIndexArrayLCCMCPJETOS_pt, + ] + O2lccmcdjetsso: + [ + fEnergyMother, + fPtLeading, + fPtSubLeading, + fTheta, + fNSub2DR, + fNSub1, + fNSub2, + fPairTheta, + fPairPt, + fPairEnergy, + ] tags: - isstd: {var: fFlagMcMatchRec, req: [[1], []]} - ismcsignal: {var: fFlagMcMatchRec, req: [[1], []], abs: true} - ismcbkg: {var: fFlagMcMatchRec, req: [[], [1]], abs: true} - ismcprompt: {var: fOriginMcRec, req: [[0], []]} - ismcfd: {var: fOriginMcRec, req: [[1], []]} + isstd: { var: fFlagMcMatchRec, req: [[1], []] } + ismcsignal: { var: fFlagMcMatchRec, req: [[1], []], abs: true } + ismcbkg: { var: fFlagMcMatchRec, req: [[], [1]], abs: true } + ismcprompt: { var: fOriginMcRec, req: [[0], []] } + ismcfd: { var: fOriginMcRec, req: [[1], []] } extract_component: - - {var: fMlScores, newvar: mlPromptScore, component: 1} - - {var: fMlScores, newvar: mlBkgScore, component: 0} + - { var: fMlScores, newvar: mlPromptScore, component: 1 } + - { var: fMlScores, newvar: mlBkgScore, component: 0 } filter: "fPt >= 1. 
and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut # swap: {cand: fCandidateSelFlag, var_swap: fIsCandidateSwapped, vars: [ismcsignal, ismcprompt, icmcfd]} @@ -127,18 +214,40 @@ LcJet_pp: # O2hflcpare: [fErrorDecayLength, fErrorDecayLengthXY, fErrorImpactParameter0, fErrorImpactParameter1] O2hflcsel: [fCandidateSelFlag] O2hflcml: [fMlScores] - O2lccjeto: [fIndexLCCJETCOS, fIndexHFLCBASES_0, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - O2lccjetsso: [fIndexLCCJETOS, fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fPairTheta, fPairPt, fPairEnergy] + O2lccjeto: + [ + fIndexLCCJETCOS, + fIndexHFLCBASES_0, + fJetPt, + fJetPhi, + fJetEta, + fJetNConstituents, + fJetR, + ] + O2lccjetsso: + [ + fIndexLCCJETOS, + fEnergyMother, + fPtLeading, + fPtSubLeading, + fTheta, + fNSub2DR, + fNSub1, + fNSub2, + fPairTheta, + fPairPt, + fPairEnergy, + ] extract_component: - - {var: fMlScores, newvar: mlPromptScore, component: 1} - - {var: fMlScores, newvar: mlBkgScore, component: 0} + - { var: fMlScores, newvar: mlPromptScore, component: 1 } + - { var: fMlScores, newvar: mlBkgScore, component: 0 } filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut merge: #- {base: jetgen, ref: collgen} - - {base: jetdet, ref: colldet} - - {base: jetdata, ref: colldata} - - {base: jetdata, ref: evtorig} + - { base: jetdet, ref: colldet } + - { base: jetdata, ref: colldata } + - { base: jetdata, ref: evtorig } # workaround for yamlfmt issue #110 write: @@ -167,14 +276,130 @@ LcJet_pp: file: AnalysisResultsBcCnt.parquet variables: - var_all: [fIndexCollisions, fFlagMcMatchRec, fCandidateSelFlag, fOriginMcRec, fIsCandidateSwapped, fNProngsContributorsPV, fY, fEta, fPt, fCpa, fCpaXY, fM, fChi2PCA, fDecayLength, fDecayLengthXY, fPtProng0, fPtProng1, fPtProng2, fImpactParameter0, fImpactParameter1, fImpactParameter2, fNSigTpcPi0, fNSigTpcPr0, fNSigTpcKa1, fNSigTpcPi2, fNSigTpcPr2, fNSigTofPi0, fNSigTofPr0, fNSigTofKa1, fNSigTofPi2, fNSigTofPr2, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] + var_all: + [ + fIndexCollisions, + fFlagMcMatchRec, + fCandidateSelFlag, + fOriginMcRec, + fIsCandidateSwapped, + fNProngsContributorsPV, + fY, + fEta, + fPt, + fCpa, + fCpaXY, + fM, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fPtProng0, + fPtProng1, + fPtProng2, + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fNSigTpcPi0, + fNSigTpcPr0, + fNSigTpcKa1, + fNSigTpcPi2, + fNSigTpcPr2, + fNSigTofPi0, + fNSigTofPr0, + fNSigTofKa1, + fNSigTofPi2, + fNSigTofPr2, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] var_training: # sel_skim_binmin bins - - [fImpactParameter0, fImpactParameter1, fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] - - [fImpactParameter0, fImpactParameter1, fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] - - [fImpactParameter0, fImpactParameter1, fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] - - [fImpactParameter0, fImpactParameter1, fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] - - [fImpactParameter0, fImpactParameter1, 
fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] - - [fImpactParameter0, fImpactParameter1, fImpactParameter2, fCpa, fChi2PCA, fDecayLength, fDecayLengthXY, fNSigTpcTofPi0, fNSigTpcTofPr0, fNSigTpcTofKa1, fNSigTpcTofPi2, fNSigTpcTofPr2] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] + - [ + fImpactParameter0, + fImpactParameter1, + fImpactParameter2, + fCpa, + fChi2PCA, + fDecayLength, + fDecayLengthXY, + fNSigTpcTofPi0, + fNSigTpcTofPr0, + fNSigTpcTofKa1, + fNSigTpcTofPi2, + fNSigTpcTofPr2, + ] var_boundaries: [fCosThetaStar, fPtProng] var_correlation: - [fCosThetaStar] # TODO: update @@ -238,7 +463,7 @@ LcJet_pp: seedmerge: [12] #list of periods period: [LHC23] #list of periods select_period: [1] - prefix_dir: /data2/MLhep/real/train_357222/ + prefix_dir: /data2/MLhep/trains/369622/ unmerged_tree_dir: [alice] #list of periods pkl: ["${USER}/lcjet/pkl"] #list of periods pkl_skimmed: ["${USER}/lcjet/pklsk"] #list of periods @@ -255,7 +480,7 @@ LcJet_pp: seedmerge: [12] #list of periods period: [LHC24h1] #list of periods select_period: [1] - prefix_dir: /data2/MLhep/sim/train_352826/ + prefix_dir: /data2/MLhep/trains/368429/ unmerged_tree_dir: [alice] pkl: ["${USER}/lcjet/pkl"] #list of periods pkl_skimmed: ["${USER}/lcjet/pklsk"] #list of periods @@ -276,7 +501,12 @@ LcJet_pp: equalise_sig_bkg: True # sampletagforsignal: 1 # sampletagforbkg: 0 - sel_ml: [fM < 2.22 or fM > 2.35, ismcsignal == 1 and ismcprompt == 1, ismcsignal == 1 and ismcfd == 1] + sel_ml: + [ + fM < 2.22 or fM > 2.35, + ismcsignal == 1 and ismcprompt == 1, + ismcsignal == 1 and ismcfd == 1, + ] class_labels: [bkg, prompt, non-prompt] nkfolds: 5 rnd_shuffle: 12 @@ -312,11 +542,15 @@ LcJet_pp: data: prefix_dir_app: "/data2/${USER}/" pkl_skimmed_dec: [LHC23pp/MLapplication/prod_LHC23/skpkldecdata] #list of periods - pkl_skimmed_decmerged: [LHC23pp/MLapplication/prod_LHC23/skpkldecdatamerged] #list of periods + pkl_skimmed_decmerged: [ + LHC23pp/MLapplication/prod_LHC23/skpkldecdatamerged, + ] #list of periods mc: prefix_dir_app: "/data2/${USER}/" pkl_skimmed_dec: [LHC23pp_mc/MLapplication/prod_LHC24h1/skpkldecmc] #list of periods - pkl_skimmed_decmerged: [LHC23pp_mc/MLapplication/prod_LHC24h1/skpkldecmcmerged] #list of periods + pkl_skimmed_decmerged: [ + LHC23pp_mc/MLapplication/prod_LHC24h1/skpkldecmcmerged, + ] #list of periods modelname: xgboost modelsperptbin: # sel_skim_binmin bins - 
xgboost_classifierLcpKpi_dfselection_fPt_1.0_2.0.sav @@ -330,9 +564,42 @@ LcJet_pp: - xgboost_classifierLcpKpi_dfselection_fPt_10.0_12.0.sav - xgboost_classifierLcpKpi_dfselection_fPt_12.0_24.0.sav probcutpresel: - data: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins - mc: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins - probcutoptimal: [[0.02, 0.0, 0.0], [0.03, 0.0, 0.0], [0.04, 0.0, 0.0], [0.07, 0.0, 0.0], [0.09, 0.0, 0.0], [0.11, 0.0, 0.0], [0.15, 0.0, 0.0], [0.18, 0.0, 0.0], [0.25, 0.0, 0.0], [0.35, 0.0, 0.0]] #list of nbins + data: [ + [0.02, 0.0, 0.0], + [0.03, 0.0, 0.0], + [0.04, 0.0, 0.0], + [0.07, 0.0, 0.0], + [0.09, 0.0, 0.0], + [0.11, 0.0, 0.0], + [0.15, 0.0, 0.0], + [0.18, 0.0, 0.0], + [0.25, 0.0, 0.0], + [0.35, 0.0, 0.0], + ] #list of nbins + mc: [ + [0.02, 0.0, 0.0], + [0.03, 0.0, 0.0], + [0.04, 0.0, 0.0], + [0.07, 0.0, 0.0], + [0.09, 0.0, 0.0], + [0.11, 0.0, 0.0], + [0.15, 0.0, 0.0], + [0.18, 0.0, 0.0], + [0.25, 0.0, 0.0], + [0.35, 0.0, 0.0], + ] #list of nbins + probcutoptimal: [ + [0.02, 0.0, 0.0], + [0.03, 0.0, 0.0], + [0.04, 0.0, 0.0], + [0.07, 0.0, 0.0], + [0.09, 0.0, 0.0], + [0.11, 0.0, 0.0], + [0.15, 0.0, 0.0], + [0.18, 0.0, 0.0], + [0.25, 0.0, 0.0], + [0.35, 0.0, 0.0], + ] #list of nbins #region analysis analysis: @@ -452,7 +719,7 @@ LcJet_pp: func_bkg: "expo" # par_start: # par_fix: {1: 2.286} - par_constrain: {1: [2.28, 2.29], 2: [.005, .03]} + par_constrain: { 1: [2.28, 2.29], 2: [.005, .03] } range: [2.08, 2.48] efficiency: @@ -510,11 +777,15 @@ LcJet_pp: mc: null data: &data_out_default runselection: [null] #FIXME - results: ["/home/${USER}/mlhep/lcjet/jet_obs/default/default/data/results"] #list of periods + results: [ + "/home/${USER}/mlhep/lcjet/jet_obs/default/default/data/results", + ] #list of periods resultsallp: "/home/${USER}/mlhep/lcjet/jet_obs/default/default/data/results_all" mc: &mc_out_default runselection: [null, null] #FIXME - results: ["/home/${USER}/mlhep/lcjet/jet_obs/default/default/mc/results"] #list of periods + results: [ + "/home/${USER}/mlhep/lcjet/jet_obs/default/default/mc/results", + ] #list of periods resultsallp: "/home/${USER}/mlhep/lcjet/jet_obs/default/default/mc/results_all" data_proc: # alternative processor output used as the analyzer input <<: *data_out_default @@ -525,21 +796,110 @@ LcJet_pp: sgnfunc: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # kGaus=0, k2Gaus=1, k2GausSigmaRatioPar=2 (sel_an_binmin bins) bkgfunc: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # kExpo=0, kLin=1, kPol2=2, kNoBk=3, kPow=4, kPowEx=5 (sel_an_binmin bins) masspeak: 2.286 - massmin: [1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66, 1.66] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] - massmax: [2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06, 2.06] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] + massmin: [ + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + 1.66, + ] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] + massmax: [ + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + 2.06, + ] # sel_an_binmin bins, fit region of the invariant mass distribution [GeV/c^2] 
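+      # FIXME: the per-pt-bin lists below have inconsistent lengths (rebin: 13,
+      # fix_mean: 12, SetFixGaussianSigma: 11, fix_sigmasec: 10 entries), though
+      # each is documented as one entry per sel_an_binmin bin.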
rebin: [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6] # sel_an_binmin bins - fix_mean: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins + fix_mean: [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + ] # sel_an_binmin bins masspeaksec: 2.286 # If SetArraySigma true: sigma_initial is taken from sigmaarray; false: sigma_initial is taken from MC # If SetFixGaussianSigma true: sigma fixed to sigma_initial # SetFixGaussianSigma: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins - SetFixGaussianSigma: [true, true, true, true, true, true, true, true, true, true, true] # sel_an_binmin bins - SetArraySigma: [false, false, false, false, false, false, false, false, false, false, false, false] # sel_an_binmin bins - sigmaarray: [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01] # initial sigma (sel_an_binmin bins) + SetFixGaussianSigma: [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + ] # sel_an_binmin bins + SetArraySigma: [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + ] # sel_an_binmin bins + sigmaarray: [ + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + ] # initial sigma (sel_an_binmin bins) fix_sigmasec: [true, true, true, true, true, true, true, true, true, true] # sel_an_binmin bins - sigmaarraysec: [0.007497, 0.007497, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01] # sel_an_binmin bins + sigmaarraysec: [ + 0.007497, + 0.007497, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + ] # sel_an_binmin bins use_reflections: true # simple fitter END @@ -562,17 +922,41 @@ LcJet_pp: powheg_path_prompt: /data/POWHEG/trees_powheg_pr_central.root powheg_prompt_variations_path: /data/POWHEG/trees_powheg_pr_ - powheg_prompt_variations: ["F1_R05", "F05_R1", "F2_R1", "F1_R2", "F2_R2", "F05_R05", "Mhigh", "Mlow"] + powheg_prompt_variations: + [ + "F1_R05", + "F05_R1", + "F2_R1", + "F1_R2", + "F2_R2", + "F05_R05", + "Mhigh", + "Mlow", + ] pythia8_prompt_variations_path: /data/PYTHIA8/trees_pythia8_pr_ pythia8_prompt_variations: ["default", "charm_lo"] #["default","colour0soft"] - pythia8_prompt_variations_legend: ["PYTHIA 8 (Monash)", "PYTHIA 8 charm LO"] # ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 0"] + pythia8_prompt_variations_legend: [ + "PYTHIA 8 (Monash)", + "PYTHIA 8 charm LO", + ] # ["PYTHIA 8 (Monash)","PYTHIA 8 SoftQCD, mode 0"] variations_db: database_variations_LcJet_pp_jet_obs.yml # Additional cuts applied before mass histogram is filled use_cuts: True - cuts: ["mlBkgScore < 0.03", "mlBkgScore < 0.04", "mlBkgScore < 0.07", "mlBkgScore < 0.09", "mlBkgScore < 0.11", "mlBkgScore < 0.15", "mlBkgScore < 0.18", "mlBkgScore < 0.25", "mlBkgScore < 0.35", "mlBkgScore < 0.35"] # (sel_an_binmin bins) + cuts: [ + "mlBkgScore < 0.03", + "mlBkgScore < 0.04", + "mlBkgScore < 0.07", + "mlBkgScore < 0.09", + "mlBkgScore < 0.11", + "mlBkgScore < 0.15", + "mlBkgScore < 0.18", + "mlBkgScore < 0.25", + "mlBkgScore < 0.35", + "mlBkgScore < 0.35", + ] # (sel_an_binmin bins) #mult_cuts: ["fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M 
<= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100", "fCentFT0M >= 70 and fCentFT0M <= 100"] # (sel_an_binmin bins) #low mult #mult_cuts: ["fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70", "fCentFT0M >= 20 and fCentFT0M <= 70"] # (sel_an_binmin bins) #intermediate mult #mult_cuts: ["fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20", "fCentFT0M >= 0 and fCentFT0M <= 20"] # (sel_an_binmin bins) #high mult From 53a0efa4ac47e9288a4e686f5d9b40130c6b8249 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 25 Mar 2025 09:35:09 +0100 Subject: [PATCH 13/20] Establish runnable version with POWHEG --- .../analysis/analyzer_jets.py | 8 ++--- .../database_ml_parameters_D0Jet_pp.yml | 31 ++++++++++++++----- machine_learning_hep/multiprocesser.py | 1 - machine_learning_hep/processer.py | 19 ++++++------ machine_learning_hep/processer_jet.py | 9 ++---- 5 files changed, 38 insertions(+), 30 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 941957e327..6d3cab5849 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -212,7 +212,7 @@ def qa(self): # pylint: disable=invalid-name # region efficiency # pylint: disable=too-many-statements def calculate_efficiencies(self): - self.logger.info("Calculating efficiencies") + self.logger.info("Calculating efficiencies from %s", self.n_fileeff) cats = {"pr", "np"} with TFile(self.n_fileeff) as rfile: h_gen = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_gen") for cat in cats} @@ -1113,8 +1113,8 @@ def estimate_feeddown(self): self.bins_candpt, bins_obs[var], ) - fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]]) - self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") + fill_hist_fast(h3_fd_gen_orig[var], df[["pt_jet", "pt_cand", f"{colname}"]]) + self._save_hist(project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") case "sim": # TODO: recover cross section @@ -1177,7 +1177,7 @@ def estimate_feeddown(self): h_fd_det = project_hist(h3_fd_det, [0, 2], {}) # old method - h3_fd_gen = h3_fd_gen_orig.Clone() + h3_fd_gen = h3_fd_gen_orig[var].Clone() ensure_sumw2(h3_fd_gen) for ipt in range(get_nbins(h3_fd_gen, 1)): eff_pr = self.hcandeff["pr"].GetBinContent(ipt + 1) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index b23ac85b40..67055a24fc 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -51,12 +51,27 @@ D0Jet_pp: #region dfs dfs: read: - evtorig: + evtorig_mc: + level: mc + index: fIndexHfD0CollBases + trees: + O2hfd0collbase: [fNumContrib, fPosZ] + filter: "abs(fPosZ) < 10." 
+ evtorig_data: level: data - index: fIndexHfD0McCollBases + index: fIndexHfD0CollBases trees: O2hfd0collbase: [fNumContrib, fPosZ] filter: "abs(fPosZ) < 10." + evtorig_fd: + level: fd + index: fIndexHFD0MCCOLLBASES + trees: + O2hfd0mccollbase: [fPosZ] + filter: "abs(fPosZ) < 10." + + # powhegcoll: + # O2d0cmcpjetmcco: ... collcnt_mc: level: mc trees: @@ -86,7 +101,7 @@ D0Jet_pp: trees: O2hfd0pbase: [fIndexHfD0McCollBases, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] O2d0cmcpjeto: [fIndexD0CMCPJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - # O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt] + O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt] O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] extra: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated @@ -110,7 +125,6 @@ D0Jet_pp: trees: O2hfd0pbase: [fIndexHFD0MCCOLLBASES, fPt, fEta, fPhi, fFlagMcMatchGen, fOriginMcGen] O2d0cmcpjeto: [fIndexD0CMCPJETCOS, fJetPt, fJetPhi, fJetEta, fJetNConstituents, fJetR] - # O2d0cmcpjetmo: [fIndexArrayD0CMCDJETOS_hf, fIndexArrayD0CMCDJETOS_geo, fIndexArrayD0CMCDJETOS_pt] O2d0cmcpjetsso: [fEnergyMother, fPtLeading, fPtSubLeading, fTheta, fNSub2DR, fNSub1, fNSub2, fAngularity] extra: fY: log((sqrt(1.864**2 + (fPt * cosh(fEta))**2) + fPt * sinh(fEta)) / sqrt(1.864**2 + fPt**2)) #TODO : change mass or make sure Lc mass is updated @@ -182,7 +196,7 @@ D0Jet_pp: filter: "fPt >= 1. and abs(fY) <= 0.8 and abs(fJetEta) < (.9 - (fJetR / 100.))" # TODO: check jet eta cut merge: - - {base: jetgen, ref: collgen} + - {base: jetgen, ref: evtorig_mc} - {base: jetfd, ref: collfd} - {base: jetdet, ref: colldet} - {base: jetdata, ref: colldata} @@ -349,8 +363,8 @@ D0Jet_pp: seedmerge: [12] #list of periods period: [LHC24d3b] #list of periods select_period: [1] - prefix_dir: /home/jklein/mlhep_test/powheg/ - unmerged_tree_dir: [d0-default/] + prefix_dir: /home/jklein/powheg/parallel/ + unmerged_tree_dir: [default/] pkl: ["${USER}/d0jet/pkl"] #list of periods pkl_skimmed: ["${USER}/d0jet/pklsk"] #list of periods pkl_skimmed_merge_for_ml: ["${USER}/d0jet/pklskml"] #list of periods @@ -432,6 +446,7 @@ D0Jet_pp: #region analysis analysis: anahptspectrum: "D0Kpi" #D0Kpi, DplusKpipi, DstarD0pi, DsKKpi, LctopKpi, LcK0Sp # used in analysis/analyzerdhadrons_mult.py + fd_input: "sim" fd_method: "Nb" #fc, Nb cctype: "pp" inputfonllpred: data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root # used in machine_learning_hep/hf_pt_spectrum.py @@ -442,7 +457,7 @@ D0Jet_pp: sel_an_binmax: [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24, 48] # hadron pt bins (sel_an_binmin bins) # FIXME: move the last edge in sel_an_binmin bins_ptjet: [5, 7, 10, 15, 20, 30, 40, 60, 70] # systematics, TODO: split rec and gen binning bins_ptjet_eff: [2, 5, 7, 10, 15, 20, 30, 40, 60, 70, 90] # systematics, TODO: split rec and gen binning - cand_collidx: fIndexHfD0CollBases + # cand_collidx: fIndexHfD0CollBases counter_read_data: fReadCountsWithTVXAndZVertexAndSel8 counter_read_mc: fReadCountsWithTVXAndZVertexAndSelMC counter_tvx: fReadCountsWithTVX diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py index 034431b43a..063397ce08 100755 --- a/machine_learning_hep/multiprocesser.py +++ b/machine_learning_hep/multiprocesser.py @@ -70,7 +70,6 @@ def 
__init__(self, case, proc_class, datap, typean, run_param, datatype): dp = datap["multi"][self.datatype.value] self.dlper_root = [self.d_prefix + os.path.expandvars(p) for p in dp["unmerged_tree_dir"]] - print('****', self.dlper_root, flush=True) self.dlper_pkl = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl"]] self.dlper_pklsk = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl_skimmed"]] self.dlper_pklml = [self.d_prefix + os.path.expandvars(p) for p in dp["pkl_skimmed_merge_for_ml"]] diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index d743798cdd..86ad85a246 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -88,7 +88,6 @@ def __init__( # directories self.d_prefix_ml = datap["ml"].get("prefix_dir_ml", "") self.d_root = d_root - print('%%%%', self.d_root, flush=True) self.d_pkl = d_pkl self.d_pklsk = d_pklsk self.d_pkl_ml = d_pkl_ml @@ -178,7 +177,6 @@ def __init__( # list of files names if os.path.isdir(self.d_root): - print(f"{self.d_root=}", flush=True) self.l_path = list_folders(self.d_root, self.n_root, self.p_maxfiles, self.select_jobs) elif glob.glob(f"{self.d_pkl}/**/{self.n_reco}", recursive=True): self.l_path = list_folders(self.d_pkl, self.n_reco, self.p_maxfiles, self.select_jobs) @@ -187,7 +185,6 @@ def __init__( ".p", "_%s%d_%d.p" % (self.v_var_binning, self.lpt_anbinmin[0], self.lpt_anbinmax[0]) ) self.l_path = list_folders(self.d_pklsk, self.n_sk, self.p_maxfiles, self.select_jobs) - print(f"{self.l_path=}", flush=True) self.l_root = createlist(self.d_root, self.l_path, self.n_root) self.l_reco = createlist(self.d_pkl, self.l_path, self.n_reco) @@ -264,7 +261,6 @@ def __init__( self.mptfiles_gensk_sl = [] self.d_pkl_decmerged = d_pkl_decmerged - print(self.d_results, self.n_filemass, flush=True) self.n_filemass = os.path.join(self.d_results, self.n_filemass) self.n_fileeff = os.path.join(self.d_results, self.n_fileeff) self.n_fileresp = os.path.join(self.d_results, self.n_fileresp) @@ -352,7 +348,7 @@ def __init__( # Analysis cuts (loaded in self.process_histomass) self.analysis_cuts = None - self.analysis_mult_cuts = None + self.analysis_multcuts = None # Flag if they should be used self.do_custom_analysis_cuts = datap["analysis"][self.typean].get("use_cuts", False) @@ -429,7 +425,11 @@ def dfuse(df_spec): with uproot.open(self.l_root[file_index]) as rfile: df_processed = set() keys = rfile.keys(recursive=False, filter_name="DF_*") - self.logger.info("found %d dataframes, reading %s", len(keys), max_no_keys or "all") + if len(keys) == 0: + self.logger.error("no dataframes found in %s", self.l_root[file_index]) + return + else: + self.logger.info("found %d dataframes, reading %s", len(keys), max_no_keys or "all") for idx, key in enumerate(keys[:max_no_keys]): if not (df_key := re.match("^DF_(\\d+);", key)): continue @@ -533,17 +533,16 @@ def dfuse(df_spec): if self.df_write: for df_name, df_spec in self.df_write.items(): if dfuse(df_spec): - self.logger.info("writing %s to %s", df_name, df_spec["file"]) src = df_spec.get("source", df_name) dfo = dfquery(dfs[src], df_spec.get("filter", None)) path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) + self.logger.info("writing %s to %s with info %s", df_name, path, dfo.info()) write_df(dfo, path) def skim(self, file_index): dfreco = read_df(self.l_reco[file_index]) if self.datatype != "fd" else None dfgen = read_df(self.l_gen[file_index]) if self.datatype in ('mc', 'fd') else None dfgen_sl = 
read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype in ('mc', 'fd') else None - print(dfreco, dfgen, dfgen_sl, flush=True) for ipt in range(self.p_nptbins): if dfreco is not None: @@ -554,7 +553,6 @@ def skim(self, file_index): if dfgen is not None: dfgensk = seldf_singlevar(dfgen, self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt]) dfgensk = dfquery(dfgensk, self.s_gen_skim[ipt]) - print(self.mptfiles_gensk, flush=True) write_df(dfgensk, self.mptfiles_gensk[ipt][file_index]) if dfgen_sl is not None: @@ -620,7 +618,7 @@ def process_unpack_par(self): self.logger.info("Unpacking %s period %s", self.datatype, self.period) create_folder_struc(self.d_pkl, self.l_path) arguments = [(i,) for i in range(len(self.l_root))] - self.logger.info("d_pkl: %s, l_path: %s, arguments: %s", self.d_pkl, str(self.l_path), str(arguments)) + # self.logger.info("d_pkl: %s, l_path: %s, arguments: %s", self.d_pkl, str(self.l_path), str(arguments)) self.parallelizer(self.unpack, arguments, self.p_chunksizeunp) def process_skim_par(self): @@ -747,6 +745,7 @@ def process_histomass(self): def process_efficiency(self): print("Doing efficiencies", self.datatype, self.period) + self.load_cuts() print("Using run selection for eff histo", self.runlistrigger, "for period", self.period) if self.doml is True: print("Doing ml analysis") diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index b9eb660f24..ee39cba695 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -161,7 +161,7 @@ def _verify_variables(self, dfi): print(dfi[~mask][var], flush=True) def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name - self.logger.info("calculating variables for DF %s", df.info()) + self.logger.info("calculating variables") observables = self.cfg("observables", {}) if len(df) == 0: for obs in observables: @@ -279,12 +279,7 @@ def process_histomass_single(self, index): self.binarray_ptjet, self.binarray_pthf, ) - print(f"Filling and writing histogram from {df.head()}", flush=True) fill_hist(h, df[["fM", "fJetPt", "fPt"]], write=True) - h.Write() - print("WRITING!!!!!!!!!!!!!!!") - rfile.WriteObject(h) - # h.Print("all") for sel_name, sel_spec in self.cfg("data_selections", {}).items(): if sel_spec["level"] == self.datatype: @@ -312,7 +307,6 @@ def process_histomass_single(self, index): df["idx_match"] = df[idx].apply(lambda ar: ar[0] if len(ar) > 0 else -1) dfquery(df, "idx_match >= 0", inplace=True) - print(f"calculation", flush=True) self._calculate_variables(df) for obs, spec in self.cfg("observables", {}).items(): @@ -495,6 +489,7 @@ def process_efficiency_single(self, index): } for cat in cats: + print(f"Filling histograms for {cat}: {dfgen[cat].info()}, {dfdet[cat].info()}, {dfmatch[cat].info()}", flush=True) fill_hist(h_eff[(cat, "gen")], dfgen[cat][["fJetPt_gen", "fPt_gen"]]) fill_hist(h_eff[(cat, "det")], dfdet[cat][["fJetPt", "fPt"]]) if cat in dfmatch and dfmatch[cat] is not None: From 4ecc76f4f18ee7169117953411d7aebc40b4f53c Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 10 Apr 2025 14:24:17 +0200 Subject: [PATCH 14/20] Fix minor issues --- machine_learning_hep/analysis/analyzer_jets.py | 11 ++++------- .../data_run3/database_ml_parameters_D0Jet_pp.yml | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 6d3cab5849..5d9eeb52ec 100644 --- 
a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -589,9 +589,6 @@ def fit(self): if iptjet is None: if not fitcfg.get("per_ptjet"): for jptjet in range(get_nbins(h, 1)): - self.logger.info( - "Overwriting roows_ptjet for %s iptjet %s ipt %d", level, jptjet, ipt - ) self.roows[(jptjet, ipt)] = roo_ws.Clone() self.roo_ws[(level, jptjet, ipt)] = roo_ws.Clone() if level in ("data", "mc"): @@ -705,10 +702,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): self.logger.info("Scaling sidebands in ptjet-%s bins: %s using %s", label, bins_ptjet, fh_sideband) hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband for iptjet in bins_ptjet: - if iptjet: - n = hx.GetBinContent(iptjet) - self.logger.info("Need to scale in ptjet %i: %g", iptjet, n) - if n <= 0: + if iptjet and hx.GetBinContent(iptjet) <= 0: continue rws = self.roo_ws.get((mcordata, iptjet, ipt)) if not rws: @@ -1071,6 +1065,7 @@ def estimate_feeddown(self): """Estimate feeddown from legacy Run 2 trees or gen-only simulation""" match self.cfg("fd_input", "tree"): case "tree": + self.logger.info("Reading feeddown information from trees") with TFile(self.cfg("fd_root")) as rfile: powheg_xsection = rfile.Get("fHistXsection") powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries() @@ -1126,6 +1121,8 @@ def estimate_feeddown(self): if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"): h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {}) ensure_sumw2(h3_fd_gen_orig[var]) + self._save_hist(project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png") + powheg_xsection_scale_factor = 1. # FIXME: retrieve cross section case fd_input: self.logger.critical("Invalid feeddown input %s", fd_input) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 67055a24fc..41d004d521 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -53,7 +53,7 @@ D0Jet_pp: read: evtorig_mc: level: mc - index: fIndexHfD0CollBases + index: fIndexHfD0McCollBases trees: O2hfd0collbase: [fNumContrib, fPosZ] filter: "abs(fPosZ) < 10." 
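# Editor's note: a minimal, self-contained sketch of how a "filter" string such
# as the one above is typically applied when these trees are unpacked, assuming
# the framework's dfquery helper wraps pandas.DataFrame.query (the
# dfquery(dfs[src], df_spec.get("filter", None)) call in processer.py suggests
# exactly that). Data values are stand-ins.
import pandas as pd

# stand-in for the unpacked O2hfd0collbase tree
df = pd.DataFrame({"fNumContrib": [5, 3, 8], "fPosZ": [2.0, -12.5, 9.9]})
# same selection as the YAML filter string: keep collisions with |z_vtx| < 10 cm
df_sel = df.query("abs(fPosZ) < 10.")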
@@ -227,11 +227,11 @@ D0Jet_pp: evtorig_data: level: data - source: evtorig + source: colldata file: AnalysisResultsEvtOrig.parquet evt_data: level: data - source: evtorig + source: colldata file: AnalysisResultsEvt.parquet evtorig_fd: @@ -446,7 +446,6 @@ D0Jet_pp: #region analysis analysis: anahptspectrum: "D0Kpi" #D0Kpi, DplusKpipi, DstarD0pi, DsKKpi, LctopKpi, LcK0Sp # used in analysis/analyzerdhadrons_mult.py - fd_input: "sim" fd_method: "Nb" #fc, Nb cctype: "pp" inputfonllpred: data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root # used in machine_learning_hep/hf_pt_spectrum.py @@ -808,6 +807,7 @@ D0Jet_pp: use_matched: true frac_mcana: .2 # fraction of MC sample for the closure + fd_input: "sim" fd_root: "/data2/vkucera/powheg/trees_powheg_fd_central.root" # systematics fd_parquet: "/data2/jklein/powheg/trees_powheg_fd_central.parquet" # systematics From 5f21f00b4cea7ff07ab7c44c2619a1309302138c Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 10 Apr 2025 14:30:58 +0200 Subject: [PATCH 15/20] Fix technical issues --- .../analysis/analyzer_jets.py | 20 ++++++++++------ machine_learning_hep/processer_jet.py | 24 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 5d9eeb52ec..ff922f0265 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -117,7 +117,7 @@ def __init__(self, datap, case, typean, period): for param, symbol in zip( ("mean", "sigma", "significance", "chi2"), ("#it{#mu}", "#it{#sigma}", "significance", "#it{#chi}^{2}"), - strict=False, + strict=True, ) } for level in self.fit_levels @@ -703,7 +703,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband for iptjet in bins_ptjet: if iptjet and hx.GetBinContent(iptjet) <= 0: - continue + continue rws = self.roo_ws.get((mcordata, iptjet, ipt)) if not rws: self.logger.error("Falling back to incl. 
roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt) @@ -1060,7 +1060,6 @@ def _extract_signal(self, hist, var, mcordata, ipt): # hres.Sumw2() # TODO: check if we should do this here return hres - def estimate_feeddown(self): """Estimate feeddown from legacy Run 2 trees or gen-only simulation""" match self.cfg("fd_input", "tree"): @@ -1096,7 +1095,9 @@ def estimate_feeddown(self): colname = col_mapping.get(var, f"{var}_jet") if f"{colname}" not in df: if var is not None: - self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname) + self.logger.error( + "No feeddown information for %s (%s), cannot estimate feeddown", var, colname + ) # print(df.info(), flush=True) continue @@ -1109,7 +1110,9 @@ def estimate_feeddown(self): bins_obs[var], ) fill_hist_fast(h3_fd_gen_orig[var], df[["pt_jet", "pt_cand", f"{colname}"]]) - self._save_hist(project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png") + self._save_hist( + project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png" + ) case "sim": # TODO: recover cross section @@ -1121,8 +1124,11 @@ def estimate_feeddown(self): if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"): h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {}) ensure_sumw2(h3_fd_gen_orig[var]) - self._save_hist(project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png") - powheg_xsection_scale_factor = 1. # FIXME: retrieve cross section + self._save_hist( + project_hist(h3_fd_gen_orig[var], [0, 2], {}), + f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png", + ) + powheg_xsection_scale_factor = 1.0 # FIXME: retrieve cross section case fd_input: self.logger.critical("Invalid feeddown input %s", fd_input) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index ee39cba695..8a0c345c54 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -83,8 +83,10 @@ def __init__( self.s_evtsel = datap["analysis"][self.typean]["evtsel"] # bins: 2d array [[low, high], ...] 
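# Editor's note: the hunk below switches the bin construction to
# zip(..., strict=True) (Python >= 3.10), so min/max lists of unequal length
# coming from the database now raise a ValueError instead of being silently
# truncated. A small illustration with made-up bin edges:
import numpy as np

lpt_anbinmin = [1, 2, 4, 6]
lpt_anbinmax = [2, 4, 6, 8]
# 2d array [[low, high], ...] as described in the comment above
bins_skimming = np.array(list(zip(lpt_anbinmin, lpt_anbinmax, strict=True)), "d")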
- self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax)), "d") # TODO: replace with cfg - self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax)), "d") + self.bins_skimming = np.array( + list(zip(self.lpt_anbinmin, self.lpt_anbinmax, strict=True)), "d" + ) # TODO: replace with cfg + self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax, strict=True)), "d") # skimming bins in overlap with the analysis range self.active_bins_skim = [ @@ -143,7 +145,7 @@ def _verify_variables(self, dfi): for idx, row in df.iterrows(): isSoftDropped = False nsd = 0 - for zg, theta in zip(row["zg_array"], row["fTheta"]): + for zg, theta in zip(row["zg_array"], row["fTheta"], strict=True): if zg >= self.cfg("zcut", 0.1): if not isSoftDropped: df.loc[idx, "zg"] = zg @@ -178,7 +180,12 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-nam df["zg"] = df["zg_array"].apply(lambda ar: next((zg for zg in ar if zg >= zcut), -0.1)) if "rg" in observables: df["rg"] = df[["zg_array", "fTheta"]].apply( - (lambda ar: next((rg for (zg, rg) in zip(ar.zg_array, ar.fTheta) if zg >= zcut), -0.1)), axis=1 + ( + lambda ar: next( + (rg for (zg, rg) in zip(ar.zg_array, ar.fTheta, strict=True) if zg >= zcut), -0.1 + ) + ), + axis=1, ) if "nsd" in observables: df["nsd"] = df["zg_array"].apply(lambda ar: len([zg for zg in ar if zg >= zcut])) @@ -232,7 +239,7 @@ def process_histomass_single(self, index): self.logger.info("Processing (histomass) %s", self.l_evtorig[index]) print(f"Opening file {self.l_histomass[index]}", flush=True) - with TFile.Open(self.l_histomass[index], "recreate") as rfile: + with TFile.Open(self.l_histomass[index], "recreate") as _: dfevtorig = read_df(self.l_evtorig[index]) histonorm = TH1F("histonorm", "histonorm", 4, 0, 4) histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel))) @@ -256,7 +263,7 @@ def process_histomass_single(self, index): get_axis(histonorm, 0).SetBinLabel(4, "N_{BC}^{TVX}") histonorm.Write() - if self.datatype != 'fd': + if self.datatype != "fd": df = pd.concat(read_df(self.mptfiles_recosk[bin][index]) for bin in self.active_bins_skim) else: df = pd.concat(read_df(self.mptfiles_gensk[bin][index]) for bin in self.active_bins_skim) @@ -489,7 +496,10 @@ def process_efficiency_single(self, index): } for cat in cats: - print(f"Filling histograms for {cat}: {dfgen[cat].info()}, {dfdet[cat].info()}, {dfmatch[cat].info()}", flush=True) + print( + f"Filling histograms for {cat}: {dfgen[cat].info()}, {dfdet[cat].info()}, {dfmatch[cat].info()}", + flush=True, + ) fill_hist(h_eff[(cat, "gen")], dfgen[cat][["fJetPt_gen", "fPt_gen"]]) fill_hist(h_eff[(cat, "det")], dfdet[cat][["fJetPt", "fPt"]]) if cat in dfmatch and dfmatch[cat] is not None: From 316289e8051da36adc5ecc90e444742cedae1229 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Thu, 10 Apr 2025 16:07:21 +0200 Subject: [PATCH 16/20] Fix technicalities --- machine_learning_hep/analysis/analyzer_jets.py | 6 ++---- machine_learning_hep/workflow/workflow_base.py | 4 +++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index ff922f0265..b111665286 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -86,9 +86,8 @@ def __init__(self, datap, case, typean, period): self.p_pdfnames = datap["analysis"][self.typean].get("pdf_names") self.p_param_names = 
datap["analysis"][self.typean].get("param_names") - # TODO: should come entirely from DB self.observables = { - "qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"], + "qa": [*self.cfg("observables", {})], "all": [*self.cfg("observables", {})], } @@ -685,7 +684,6 @@ def _subtract_sideband(self, hist, var, mcordata, ipt): # project out the mass regions (first axis) axes = list(range(get_dim(hist)))[1:] fh[region] = project_hist(hist, axes, {0: bins[region]}) - self.logger.info("Projecting %s to %s in %s: %g entries", hist, axes, bins[region], fh[region].GetEntries()) self._save_hist( fh[region], f"sideband/h_ptjet{label}_{region}_{string_range_pthf(range_pthf)}_{mcordata}.png" ) @@ -1128,7 +1126,7 @@ def estimate_feeddown(self): project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png", ) - powheg_xsection_scale_factor = 1.0 # FIXME: retrieve cross section + powheg_xsection_scale_factor = 0. # FIXME: retrieve cross section case fd_input: self.logger.critical("Invalid feeddown input %s", fd_input) diff --git a/machine_learning_hep/workflow/workflow_base.py b/machine_learning_hep/workflow/workflow_base.py index 57929513c7..a0870550f1 100644 --- a/machine_learning_hep/workflow/workflow_base.py +++ b/machine_learning_hep/workflow/workflow_base.py @@ -14,6 +14,7 @@ from functools import reduce from os.path import join +from typing import TypeVar # pylint: disable=import-error, no-name-in-module from ROOT import gStyle @@ -37,7 +38,8 @@ def __init__(self, datap, case, typean, period=None): self.typean = typean self.period = period - def cfg(self, param, default=None): + T = TypeVar("T") + def cfg(self, param: str, default: T = None) -> T: return reduce( lambda d, key: d.get(key, default) if isinstance(d, dict) else default, param.split("."), From 91b6ca1f2ea7f8b6440e9a24fcecacab1f52525a Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Mon, 28 Apr 2025 16:24:07 +0200 Subject: [PATCH 17/20] Propagate POWHEG weights to luminosity scaling --- machine_learning_hep/analysis/analyzer_jets.py | 5 ++++- .../data_run3/database_ml_parameters_D0Jet_pp.yml | 9 +++++++-- machine_learning_hep/processer.py | 15 +++++++++++---- machine_learning_hep/processer_jet.py | 10 +++++++++- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index b111665286..8c93dbabf5 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -1126,7 +1126,10 @@ def estimate_feeddown(self): project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png", ) - powheg_xsection_scale_factor = 0. 
# FIXME: retrieve cross section + h_norm = rfile.Get("histonorm") + powheg_xsection_scale_factor = h_norm.GetBinContent(5) + self.logger.info("powheg_xsection_scale_factor = %f", powheg_xsection_scale_factor) + self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor) case fd_input: self.logger.critical("Invalid feeddown input %s", fd_input) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 41d004d521..a8014409ed 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -70,8 +70,9 @@ D0Jet_pp: O2hfd0mccollbase: [fPosZ] filter: "abs(fPosZ) < 10." - # powhegcoll: - # O2d0cmcpjetmcco: ... + coll_powheg_fd: + trees: + O2d0cmcpjetmcco: [fEventWeight] collcnt_mc: level: mc trees: @@ -234,6 +235,9 @@ D0Jet_pp: source: colldata file: AnalysisResultsEvt.parquet + coll_powheg_fd: + level: fd + file: AnalysisResultsPowheg.parquet evtorig_fd: level: fd source: collfd @@ -311,6 +315,7 @@ D0Jet_pp: namefile_reco_applieddata: AnalysisResultsRecoAppliedData.parquet namefile_reco_appliedmc: AnalysisResultsRecoAppliedMC.parquet namefile_mcweights: mcweights.root + namefile_wgt: AnalysisResultsPowheg.parquet treeoutput: "D0tree" histofilename: "masshisto.root" efffilename: "effhisto.root" diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 86ad85a246..6358bf7c1f 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -141,6 +141,7 @@ def __init__( self.n_evt = datap["files_names"]["namefile_evt"] self.n_collcnt = datap["files_names"].get("namefile_collcnt") self.n_bccnt = datap["files_names"].get("namefile_bccnt") + self.n_wgt = datap["files_names"].get("namefile_wgt", None) self.n_evtorig = datap["files_names"].get("namefile_evtorig") self.n_evt_count_ml = datap["files_names"].get("namefile_evt_count", "evtcount.yaml") self.n_gen = datap["files_names"]["namefile_gen"] @@ -193,6 +194,7 @@ def __init__( self.l_evtorig = createlist(self.d_pkl, self.l_path, self.n_evtorig) self.l_collcnt = createlist(self.d_pkl, self.l_path, self.n_collcnt) if self.datatype != "fd" else None self.l_bccnt = createlist(self.d_pkl, self.l_path, self.n_bccnt) if self.datatype != "fd" else None + self.l_wgt = createlist(self.d_pkl, self.l_path, self.n_wgt) if self.datatype == "fd" and self.n_wgt else None self.l_histomass = createlist(self.d_results, self.l_path, self.n_filemass) self.l_histoeff = createlist(self.d_results, self.l_path, self.n_fileeff) # self.l_historesp = createlist(self.d_results, self.l_path, self.n_fileresp) @@ -376,6 +378,8 @@ def dfread(rdir, trees, cols, idx_name=None): trees = [trees] cols = [cols] # if all(type(var) is str for var in vars): vars = [vars] + if not all((name in rdir for name in trees)): + self.logger.critical("Missing trees: %s", trees) df = None for tree, col in zip([rdir[name] for name in trees], cols, strict=False): try: @@ -534,10 +538,13 @@ def dfuse(df_spec): for df_name, df_spec in self.df_write.items(): if dfuse(df_spec): src = df_spec.get("source", df_name) - dfo = dfquery(dfs[src], df_spec.get("filter", None)) - path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) - self.logger.info("writing %s to %s with info %s", df_name, path, dfo.info()) - write_df(dfo, path) + if src in dfs: + dfo = dfquery(dfs[src], df_spec.get("filter", None)) + path = 
os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) + self.logger.info("writing %s to %s with info %s", df_name, path, dfo.info()) + write_df(dfo, path) + else: + self.logger.error("could not write tree, missing source %s", src) def skim(self, file_index): dfreco = read_df(self.l_reco[file_index]) if self.datatype != "fd" else None diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 8a0c345c54..35b7311786 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -241,7 +241,7 @@ def process_histomass_single(self, index): print(f"Opening file {self.l_histomass[index]}", flush=True) with TFile.Open(self.l_histomass[index], "recreate") as _: dfevtorig = read_df(self.l_evtorig[index]) - histonorm = TH1F("histonorm", "histonorm", 4, 0, 4) + histonorm = TH1F("histonorm", "histonorm", 5, 0, 5) histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel))) if self.l_collcnt: dfcollcnt = read_df(self.l_collcnt[index]) @@ -257,10 +257,18 @@ def process_histomass_single(self, index): ser_bccnt = dfbccnt[self.cfg("counter_tvx")] bccnt_tvx = functools.reduce(lambda x, y: float(x) + float(y), (ar[0] for ar in ser_bccnt)) histonorm.SetBinContent(4, bccnt_tvx) + if self.l_wgt: + self.logger.info("Filling event weights") + dfwgt = read_df(self.l_wgt[index]) + print(dfwgt.info()) + histonorm.SetBinContent(5, dfwgt["fEventWeight"].sum()) + else: + self.logger.warning("No event weights found, empty list: %s", self.l_wgt) get_axis(histonorm, 0).SetBinLabel(1, "N_{evt}") get_axis(histonorm, 0).SetBinLabel(2, "N_{coll}") get_axis(histonorm, 0).SetBinLabel(3, "N_{coll}^{TVX}") get_axis(histonorm, 0).SetBinLabel(4, "N_{BC}^{TVX}") + get_axis(histonorm, 0).SetBinLabel(5, "w_{POWHEG}") histonorm.Write() if self.datatype != "fd": From 303974678e587c7209de24c2d14afb678e887e4b Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Mon, 28 Apr 2025 17:08:45 +0200 Subject: [PATCH 18/20] Clean up code --- machine_learning_hep/analysis/analyzer_jets.py | 1 - .../data/data_run3/database_ml_parameters_D0Jet_pp.yml | 8 ++++---- machine_learning_hep/processer.py | 1 - machine_learning_hep/processer_jet.py | 8 +------- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 8c93dbabf5..4022b567e8 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -1113,7 +1113,6 @@ def estimate_feeddown(self): ) case "sim": - # TODO: recover cross section h3_fd_gen_orig = {} with TFile(self.n_filemass_fd) as rfile: for var in self.observables["all"]: diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index a8014409ed..58df884f55 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -69,7 +69,7 @@ D0Jet_pp: trees: O2hfd0mccollbase: [fPosZ] filter: "abs(fPosZ) < 10." 
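# Editor's note: a minimal sketch of the weight bookkeeping this series adds.
# The POWHEG per-event weights (fEventWeight, read from the O2d0cmcpjetmcco
# tree configured above) are summed into an extra histonorm bin so the analyzer
# can recover the cross-section scaling later. Stand-in data, plain pandas and
# PyROOT:
import pandas as pd
from ROOT import TH1F  # pylint: disable=no-name-in-module

dfwgt = pd.DataFrame({"fEventWeight": [0.8, 1.1, 0.95]})  # stand-in for read_df(...)
histonorm = TH1F("histonorm", "histonorm", 5, 0, 5)
histonorm.SetBinContent(5, dfwgt["fEventWeight"].sum())
histonorm.GetXaxis().SetBinLabel(5, "w_{POWHEG}")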
- + coll_powheg_fd: trees: O2d0cmcpjetmcco: [fEventWeight] @@ -216,7 +216,7 @@ D0Jet_pp: jetdata: level: data file: AnalysisResultsReco.parquet - + evtorig_mc: level: mc source: collgen @@ -225,7 +225,7 @@ D0Jet_pp: level: mc source: collgen file: AnalysisResultsEvt.parquet - + evtorig_data: level: data source: colldata @@ -234,7 +234,7 @@ D0Jet_pp: level: data source: colldata file: AnalysisResultsEvt.parquet - + coll_powheg_fd: level: fd file: AnalysisResultsPowheg.parquet diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 6358bf7c1f..280ced430e 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -541,7 +541,6 @@ def dfuse(df_spec): if src in dfs: dfo = dfquery(dfs[src], df_spec.get("filter", None)) path = os.path.join(self.d_pkl, self.l_path[file_index], df_spec["file"]) - self.logger.info("writing %s to %s with info %s", df_name, path, dfo.info()) write_df(dfo, path) else: self.logger.error("could not write tree, missing source %s", src) diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 35b7311786..9492f7ed98 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -238,7 +238,6 @@ def split_df(self, dfi, frac): def process_histomass_single(self, index): self.logger.info("Processing (histomass) %s", self.l_evtorig[index]) - print(f"Opening file {self.l_histomass[index]}", flush=True) with TFile.Open(self.l_histomass[index], "recreate") as _: dfevtorig = read_df(self.l_evtorig[index]) histonorm = TH1F("histonorm", "histonorm", 5, 0, 5) @@ -260,9 +259,8 @@ def process_histomass_single(self, index): if self.l_wgt: self.logger.info("Filling event weights") dfwgt = read_df(self.l_wgt[index]) - print(dfwgt.info()) histonorm.SetBinContent(5, dfwgt["fEventWeight"].sum()) - else: + elif self.datatype == "fd": self.logger.warning("No event weights found, empty list: %s", self.l_wgt) get_axis(histonorm, 0).SetBinLabel(1, "N_{evt}") get_axis(histonorm, 0).SetBinLabel(2, "N_{coll}") @@ -504,10 +502,6 @@ def process_efficiency_single(self, index): } for cat in cats: - print( - f"Filling histograms for {cat}: {dfgen[cat].info()}, {dfdet[cat].info()}, {dfmatch[cat].info()}", - flush=True, - ) fill_hist(h_eff[(cat, "gen")], dfgen[cat][["fJetPt_gen", "fPt_gen"]]) fill_hist(h_eff[(cat, "det")], dfdet[cat][["fJetPt", "fPt"]]) if cat in dfmatch and dfmatch[cat] is not None: From 24e9846e3a29333f80ba119005e52e1f416733cd Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Mon, 28 Apr 2025 17:16:36 +0200 Subject: [PATCH 19/20] Fix formatting --- machine_learning_hep/multiprocesser.py | 16 ++++++-- machine_learning_hep/processer.py | 56 +++++++++++++++----------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/machine_learning_hep/multiprocesser.py b/machine_learning_hep/multiprocesser.py index 063397ce08..76d8ce7bf3 100755 --- a/machine_learning_hep/multiprocesser.py +++ b/machine_learning_hep/multiprocesser.py @@ -16,16 +16,18 @@ main script for doing data processing, machine learning and analysis """ -from functools import reduce import os import tempfile +from functools import reduce from typing import TypeVar from machine_learning_hep.io_ml_utils import dump_yaml_from_dict, parse_yaml from machine_learning_hep.logger import get_logger from machine_learning_hep.utilities import merge_method, mergerootfiles + from .common import DataType + class MultiProcesser: # pylint: disable=too-many-instance-attributes, too-many-statements, 
consider-using-f-string, too-many-branches species = "multiprocesser" logger = get_logger() @@ -111,8 +113,12 @@ def __init__(self, case, proc_class, datap, typean, run_param, datatype): self.lper_evtorig = [os.path.join(direc, self.n_evtorig) for direc in self.dlper_pkl] dp = self.cfg(f"mlapplication.{self.datatype.value}", {}) - self.dlper_reco_modapp = [self.d_prefix_app + p for p in dp["pkl_skimmed_dec"]] if dp else [None] * len(self.p_period) - self.dlper_reco_modappmerged = [self.d_prefix_app + p for p in dp["pkl_skimmed_decmerged"]] if dp else [None] * len(self.p_period) + self.dlper_reco_modapp = ( + [self.d_prefix_app + p for p in dp["pkl_skimmed_dec"]] if dp else [None] * len(self.p_period) + ) + self.dlper_reco_modappmerged = ( + [self.d_prefix_app + p for p in dp["pkl_skimmed_decmerged"]] if dp else [None] * len(self.p_period) + ) dp = self.cfg(f"analysis.{self.typean}.{self.datatype.value}", {}) self.d_results = [self.d_prefix_res + os.path.expandvars(p) for p in dp["results"]] @@ -121,7 +127,9 @@ def __init__(self, case, proc_class, datap, typean, run_param, datatype): self.f_evt_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evt) self.f_evtorig_mergedallp = os.path.join(self.d_pklevt_mergedallp, self.n_evtorig) - self.lper_runlistrigger = self.cfg(f"analysis.{self.typean}.{self.datatype.value}.runselection", [None] * len(self.p_period)) + self.lper_runlistrigger = self.cfg( + f"analysis.{self.typean}.{self.datatype.value}.runselection", [None] * len(self.p_period) + ) self.lper_mcreweights = None if self.datatype == DataType.MC: diff --git a/machine_learning_hep/processer.py b/machine_learning_hep/processer.py index 280ced430e..55e25c4355 100644 --- a/machine_learning_hep/processer.py +++ b/machine_learning_hep/processer.py @@ -219,18 +219,18 @@ def __init__( # Potentially mask certain values (e.g. 
nsigma TOF of -999) self.p_mask_values = datap["ml"].get("mask_values", None) - self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax, strict=False)), "d") - self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax, strict=False)), "d") + self.bins_skimming = np.array(list(zip(self.lpt_anbinmin, self.lpt_anbinmax, strict=True)), "d") + self.bins_analysis = np.array(list(zip(self.lpt_finbinmin, self.lpt_finbinmax, strict=True)), "d") bin_matching = [ [ptrange[0] <= bin[0] and ptrange[1] >= bin[1] for ptrange in self.bins_skimming].index(True) for bin in self.bins_analysis ] self.lpt_probcutpre = self.cfg_global(f"mlapplication.probcutpresel.{self.datatype}", [None] * self.p_nptbins) - lpt_probcutfin_tmp = self.cfg_global(f"mlapplication.probcutoptimal", [None] * self.p_nptfinbins) + lpt_probcutfin_tmp = self.cfg_global("mlapplication.probcutoptimal", [None] * self.p_nptfinbins) self.lpt_probcutfin = [lpt_probcutfin_tmp[bin_matching[ibin]] for ibin in range(self.p_nptfinbins)] - if self.datatype in ('mc', 'data'): + if self.datatype in ("mc", "data"): for ibin, probcutfin in enumerate(self.lpt_probcutfin): probcutpre = self.lpt_probcutpre[bin_matching[ibin]] if self.mltype == "MultiClassification": @@ -254,7 +254,7 @@ def __init__( for ipt in range(self.p_nptfinbins): mlsel_multi = [ f"y_test_prob{self.p_modelname}{label.replace('-', '_')} {comp} {probcut}" - for label, comp, probcut in zip(self.class_labels, comps, self.lpt_probcutfin[ipt], strict=False) + for label, comp, probcut in zip(self.class_labels, comps, self.lpt_probcutfin[ipt], strict=True) ] self.d_pkl_dec = d_pkl_dec @@ -291,7 +291,7 @@ def __init__( ) self.lpt_recodec = None - if self.doml and self.datatype in ('mc', 'data'): + if self.doml and self.datatype in ("mc", "data"): if self.mltype == "MultiClassification": self.lpt_recodec = [ self.n_reco.replace( @@ -320,22 +320,30 @@ def __init__( for i in range(self.p_nptbins) ] - self.mptfiles_recosk = [ - createlist(self.d_pklsk, self.l_path, self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins) - ] if self.datatype in ('mc', 'data') else [] - self.mptfiles_recoskmldec = [ - createlist(self.d_pkl_dec, self.l_path, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) - ] if self.datatype in ('mc', 'data') else [] - self.lpt_recodecmerged = [ - os.path.join(self.d_pkl_decmerged, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins) - ] if self.datatype in ('mc', 'data') else [] - if self.datatype in ('mc', 'fd'): + self.mptfiles_recosk = ( + [createlist(self.d_pklsk, self.l_path, self.lpt_recosk[ipt]) for ipt in range(self.p_nptbins)] + if self.datatype in ("mc", "data") + else [] + ) + self.mptfiles_recoskmldec = ( + [createlist(self.d_pkl_dec, self.l_path, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins)] + if self.datatype in ("mc", "data") + else [] + ) + self.lpt_recodecmerged = ( + [os.path.join(self.d_pkl_decmerged, self.lpt_recodec[ipt]) for ipt in range(self.p_nptbins)] + if self.datatype in ("mc", "data") + else [] + ) + if self.datatype in ("mc", "fd"): self.mptfiles_gensk = [ createlist(self.d_pklsk, self.l_path, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) ] - self.lpt_gendecmerged = [ - os.path.join(self.d_pkl_decmerged, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins) - ] if self.d_pkl_decmerged else [] + self.lpt_gendecmerged = ( + [os.path.join(self.d_pkl_decmerged, self.lpt_gensk[ipt]) for ipt in range(self.p_nptbins)] + if self.d_pkl_decmerged + else [] + ) self.mptfiles_gensk_sl = ( 
[createlist(self.d_pklsk, self.l_path, self.lpt_gensk_sl[ipt]) for ipt in range(self.p_nptbins)] if self.lpt_gensk_sl @@ -378,10 +386,10 @@ def dfread(rdir, trees, cols, idx_name=None): trees = [trees] cols = [cols] # if all(type(var) is str for var in vars): vars = [vars] - if not all((name in rdir for name in trees)): + if not all(name in rdir for name in trees): self.logger.critical("Missing trees: %s", trees) df = None - for tree, col in zip([rdir[name] for name in trees], cols, strict=False): + for tree, col in zip([rdir[name] for name in trees], cols, strict=True): try: data = tree.arrays(expressions=col, library="np") dfnew = pd.DataFrame(columns=col, data=data) @@ -448,7 +456,7 @@ def dfuse(df_spec): if dfuse(df_spec): trees = [] cols = [] - for tree, spec in zip(df_spec["trees"].keys(), df_spec["trees"].values(), strict=False): + for tree, spec in zip(df_spec["trees"].keys(), df_spec["trees"].values(), strict=True): if isinstance(spec, list): trees.append(tree) cols.append(spec) @@ -547,8 +555,8 @@ def dfuse(df_spec): def skim(self, file_index): dfreco = read_df(self.l_reco[file_index]) if self.datatype != "fd" else None - dfgen = read_df(self.l_gen[file_index]) if self.datatype in ('mc', 'fd') else None - dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype in ('mc', 'fd') else None + dfgen = read_df(self.l_gen[file_index]) if self.datatype in ("mc", "fd") else None + dfgen_sl = read_df(self.l_gen_sl[file_index]) if self.n_gen_sl and self.datatype in ("mc", "fd") else None for ipt in range(self.p_nptbins): if dfreco is not None: From f2e87686409c655701cbdb5530b85077d2e15e02 Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Tue, 12 Aug 2025 11:22:34 +0200 Subject: [PATCH 20/20] Update POWHEG scaling --- machine_learning_hep/analysis/analyzer_jets.py | 13 +++---------- .../data_run3/database_ml_parameters_D0Jet_pp.yml | 1 + machine_learning_hep/processer_jet.py | 8 +++++--- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 4022b567e8..7ace5632a8 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -1108,9 +1108,6 @@ def estimate_feeddown(self): bins_obs[var], ) fill_hist_fast(h3_fd_gen_orig[var], df[["pt_jet", "pt_cand", f"{colname}"]]) - self._save_hist( - project_hist(h3_fd_gen_orig[var], [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png" - ) case "sim": h3_fd_gen_orig = {} @@ -1120,15 +1117,11 @@ def estimate_feeddown(self): label = f"-{var}" if var else "" if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"): h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {}) - ensure_sumw2(h3_fd_gen_orig[var]) - self._save_hist( - project_hist(h3_fd_gen_orig[var], [0, 2], {}), - f"fd/h_ptjet-{var}_feeddown_genonly_noeffscaling.png", - ) h_norm = rfile.Get("histonorm") - powheg_xsection_scale_factor = h_norm.GetBinContent(5) + powheg_xsection_avg = h_norm.GetBinContent(6) / h_norm.GetBinContent(5) + powheg_xsection_scale_factor = powheg_xsection_avg / h_norm.GetBinContent(5) self.logger.info("powheg_xsection_scale_factor = %f", powheg_xsection_scale_factor) - self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor) + self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1. 
/ powheg_xsection_scale_factor) case fd_input: self.logger.critical("Invalid feeddown input %s", fd_input) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml index 58df884f55..c54851067e 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0Jet_pp.yml @@ -71,6 +71,7 @@ D0Jet_pp: filter: "abs(fPosZ) < 10." coll_powheg_fd: + level: fd trees: O2d0cmcpjetmcco: [fEventWeight] collcnt_mc: diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 9492f7ed98..b8fee7c4ae 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -240,7 +240,7 @@ def process_histomass_single(self, index): with TFile.Open(self.l_histomass[index], "recreate") as _: dfevtorig = read_df(self.l_evtorig[index]) - histonorm = TH1F("histonorm", "histonorm", 5, 0, 5) + histonorm = TH1F("histonorm", "histonorm", 6, 0, 6) histonorm.SetBinContent(1, len(dfquery(dfevtorig, self.s_evtsel))) if self.l_collcnt: dfcollcnt = read_df(self.l_collcnt[index]) @@ -259,14 +259,16 @@ def process_histomass_single(self, index): if self.l_wgt: self.logger.info("Filling event weights") dfwgt = read_df(self.l_wgt[index]) - histonorm.SetBinContent(5, dfwgt["fEventWeight"].sum()) + histonorm.SetBinContent(5, len(dfwgt["fEventWeight"])) + histonorm.SetBinContent(6, dfwgt["fEventWeight"].sum()) elif self.datatype == "fd": self.logger.warning("No event weights found, empty list: %s", self.l_wgt) get_axis(histonorm, 0).SetBinLabel(1, "N_{evt}") get_axis(histonorm, 0).SetBinLabel(2, "N_{coll}") get_axis(histonorm, 0).SetBinLabel(3, "N_{coll}^{TVX}") get_axis(histonorm, 0).SetBinLabel(4, "N_{BC}^{TVX}") - get_axis(histonorm, 0).SetBinLabel(5, "w_{POWHEG}") + get_axis(histonorm, 0).SetBinLabel(5, "N_{POWHEG}") + get_axis(histonorm, 0).SetBinLabel(6, "sum_xs_{POWHEG}") histonorm.Write() if self.datatype != "fd":
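# Editor's note: recap of the final POWHEG normalisation, mirroring the
# analyzer arithmetic in this last patch. histonorm bin 5 holds the number of
# POWHEG events, bin 6 the sum of their event weights; the numbers used here
# are placeholders only.
n_powheg = 1.0e6                        # histonorm bin 5, N_{POWHEG}
sum_wgt = 7.0e7                         # histonorm bin 6, sum_xs_{POWHEG}
xsection_avg = sum_wgt / n_powheg       # average event weight, i.e. cross section per event
scale_factor = xsection_avg / n_powheg  # powheg_xsection_scale_factor in the analyzer
lumi = 1.0 / scale_factor               # equivalent luminosity (mb^-1), as logged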