From 02dfd45faa3a383828691f098fa8204c398fb6f7 Mon Sep 17 00:00:00 2001 From: swenzel Date: Tue, 5 Aug 2025 15:38:52 +0200 Subject: [PATCH 1/2] Improvements for grid_submit * ability to just wait for any succeeding job in a split * some cleanup/renaming --- GRID/utils/grid_submit.sh | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/GRID/utils/grid_submit.sh b/GRID/utils/grid_submit.sh index 7dddd5e80..b3d6372d2 100755 --- a/GRID/utils/grid_submit.sh +++ b/GRID/utils/grid_submit.sh @@ -216,6 +216,7 @@ while [ $# -gt 0 ] ; do --prodsplit) PRODSPLIT=$2; shift 2 ;; # allows to set JDL production split level (useful to easily replicate workflows) --singularity) SINGULARITY=ON; shift 1 ;; # run everything inside singularity --wait) WAITFORALIEN=ON; shift 1 ;; #wait for alien jobs to finish + --wait-any) WAITFORALIENANY=ON; WAITFORALIEN=ON; shift 1 ;; #wait for any good==done alien jobs to return --outputspec) OUTPUTSPEC=$2; shift 2 ;; #provide comma separate list of JDL file specs to be put as part of JDL Output field (example '"*.log@disk=1","*.root@disk=2"') -h) Usage ; exit ;; --help) Usage ; exit ;; @@ -227,6 +228,7 @@ export JOBTTL export JOBLABEL export MATTERMOSTHOOK export CONTROLSERVER + [[ $PRODSPLIT -gt 100 ]] && echo "Production split needs to be smaller than 100 for the moment" && exit 1 # check for presence of jq (needed in code path to fetch output files) @@ -270,9 +272,10 @@ pok "Set the job name by running $0 " # Generate local workdir # if [[ "${ONGRID}" == "0" ]]; then - WORKDIR=${WORKDIR:-/tmp/alien_work/$(basename "$MY_JOBWORKDIR")} - [ ! -d "${WORKDIR}" ] && mkdir -p ${WORKDIR} - [ ! "${CONTINUE_WORKDIR}" ] && cp "${MY_JOBSCRIPT}" "${WORKDIR}/alien_jobscript.sh" + GRID_SUBMIT_WORKDIR=${GRID_SUBMIT_WORKDIR:-/tmp/alien_work/$(basename "$MY_JOBWORKDIR")} + echo "WORKDIR FOR THIS JOB IS ${GRID_SUBMIT_WORKDIR}" + [ ! -d "${GRID_SUBMIT_WORKDIR}" ] && mkdir -p ${GRID_SUBMIT_WORKDIR} + [ ! 
"${CONTINUE_WORKDIR}" ] && cp "${MY_JOBSCRIPT}" "${GRID_SUBMIT_WORKDIR}/alien_jobscript.sh" fi # @@ -349,7 +352,7 @@ if [[ "${IS_ALIEN_JOB_SUBMITTER}" ]]; then cd "$(dirname "$0")" THIS_SCRIPT="$PWD/$(basename "$0")" - cd "${WORKDIR}" + cd "${GRID_SUBMIT_WORKDIR}" QUOT='"' # ---- Generate JDL ---------------- @@ -436,11 +439,18 @@ EOF continue fi let counter=0 # reset counter - JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}') - # echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}" + + # this is the global job status (a D here means the production is done) + JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}') # this is the global job status + # in addition we may query individual splits + if [ "${WAITFORANY}" ]; then + if ALIENPY_JSON=true alien.py ps -a -m "${MY_JOBID}" | grep "status" | grep -q "DONE"; then + JOBSTATUS="D" # a D here means == some job finished successfully + fi + fi if [ "${JOBSTATUS}" == "D" ]; then - echo "Job done" + echo "${WAITFORALIENANY:+At least one }Job(s) done" WAITFORALIEN="" # guarantees to go out of outer while loop if [ "${FETCHOUTPUT}" ]; then @@ -473,10 +483,6 @@ EOF done fi fi - if [[ "${FOO:0:1}" == [EK] ]]; then - echo "Job error occured" - exit 1 - fi done # get the job data products locally if requested @@ -490,7 +496,7 @@ if [[ ${SINGULARITY} ]]; then # if singularity was asked we restart this script within a container # it's actually much like the GRID mode --> which is why we set JALIEN_TOKEN_CERT set -x - cp $0 ${WORKDIR} + cp $0 ${GRID_SUBMIT_WORKDIR} # detect architecture (ARM or X86) ARCH=$(uname -i) @@ -508,15 +514,15 @@ if [[ ${SINGULARITY} ]]; then APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer" # we can actually analyse the local JDL to find the package and set it up for the container - ${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \ + ${APPTAINER_EXEC} exec -C -B 
/cvmfs:/cvmfs,${GRID_SUBMIT_WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \ ${CONTINUE_WORKDIR:+"-c ${CONTINUE_WORKDIR}"} --local ${O2TAG:+--o2tag ${O2TAG}} --ttl ${JOBTTL} --label ${JOBLABEL:-label} ${MATTERMOSTHOOK:+--mattermost ${MATTERMOSTHOOK}} ${CONTROLSERVER:+--controlserver ${CONTROLSERVER}} set +x exit $? fi if [[ "${ONGRID}" == 0 ]]; then - banner "Executing job in directory ${WORKDIR}" - cd "${WORKDIR}" 2> /dev/null + banner "Executing job in directory ${GRID_SUBMIT_WORKDIR}" + cd "${GRID_SUBMIT_WORKDIR}" 2> /dev/null fi exec &> >(tee -a alien_log_${ALIEN_PROC_ID:-0}.txt) From 15d2c676c4b827286de76870fba195b636964ff0 Mon Sep 17 00:00:00 2001 From: swenzel Date: Wed, 19 Mar 2025 16:43:28 +0100 Subject: [PATCH 2/2] Setup to test anchorMC in the 2tag approach Provides * Template file to generate anchored MC jobs * A driver script creating actual test jobs and executing them on the GRID * CSV data file specifying which software tags should be tested The idea is that this should be used to check if MC (daily) releases are compatible with the 2tag approach relative to a given set of async software releases. 
--- .../ANCHOR/tests/test_anchor_2tag_template.sh | 57 +++++++ MC/run/ANCHOR/tests/test_anchor_cases.csv | 22 +++ MC/run/ANCHOR/tests/test_looper.sh | 142 ++++++++++++++++++ 3 files changed, 221 insertions(+) create mode 100644 MC/run/ANCHOR/tests/test_anchor_2tag_template.sh create mode 100644 MC/run/ANCHOR/tests/test_anchor_cases.csv create mode 100755 MC/run/ANCHOR/tests/test_looper.sh diff --git a/MC/run/ANCHOR/tests/test_anchor_2tag_template.sh b/MC/run/ANCHOR/tests/test_anchor_2tag_template.sh new file mode 100644 index 000000000..f97e7f233 --- /dev/null +++ b/MC/run/ANCHOR/tests/test_anchor_2tag_template.sh @@ -0,0 +1,57 @@ +#!/bin/bash +#JDL_OUTPUT=*.txt@disk=1,AO2D.root@disk=2,*.log@disk=1,*stat*@disk=1,*.json@disk=1,debug*tgz@disk=2 +#JDL_ERROROUTPUT=*.txt@disk=1,AO2D.root@disk=2,*.log@disk=1,*.json@disk=1,debug*tgz@disk=2 +#JDL_PACKAGE=%{SOFTWARETAG_SIM} +#JDL_REQUIRE=%{JDL_REQUIREMENT} + +# +# A template anchoring script to test various anchoring setups +# and software combinations +# + +# only relevant if executed locally +if [ ! 
${O2_ROOT} ]; then + source <(/cvmfs/alice.cern.ch/bin/alienv printenv %{SOFTWARETAG_SIM}) +fi + +# meta configuration of the job (influences reco config) +export ALIEN_JDL_LPMPRODUCTIONTYPE=MC +export ALIEN_JDL_CPULIMIT=8 + +export ALIEN_JDL_LPMANCHORPASSNAME=%{PASSNAME} +export ALIEN_JDL_MCANCHOR=%{PASSNAME} +export ALIEN_JDL_COLLISIONSYSTEM=%{COL_SYSTEM} +export ALIEN_JDL_LPMPASSNAME=%{PASSNAME} +export ALIEN_JDL_LPMRUNNUMBER=%{RUN_NUMBER} +export ALIEN_JDL_LPMANCHORRUN=%{RUN_NUMBER} + +export ALIEN_JDL_LPMINTERACTIONTYPE=%{INTERACTIONTYPE} +export ALIEN_JDL_LPMPRODUCTIONTAG=%{PRODUCTION_TAG} +export ALIEN_JDL_LPMANCHORPRODUCTION=%{ANCHOR_PRODUCTION} +export ALIEN_JDL_LPMANCHORYEAR=%{ANCHORYEAR} +export ALIEN_JDL_O2DPG_ASYNC_RECO_TAG="%{SOFTWARETAG_ASYNC}" + +# get custom O2DPG for 2tag treatment (could be used to test different O2DPG branches) +# git clone https://github.com/AliceO2Group/O2DPG O2DPG +# export O2DPG_ROOT=${PWD}/O2DPG +# export ALIEN_JDL_O2DPG_OVERWRITE=${PWD}/O2DPG + +# dimension the job +export NTIMEFRAMES=1 + +# further configuration of the job +export ALIEN_JDL_ADDTIMESERIESINMC=0 +export DISABLE_QC=1 +export ALIEN_JDL_MC_ORBITS_PER_TF=10000:10000000:2 # puts just 2 orbit for large enough interaction rates +export ALIEN_JDL_O2DPGWORKFLOWTARGET="aod" + +# select anchoring points +export PRODSPLIT=${ALIEN_O2DPG_GRIDSUBMIT_PRODSPLIT:-100} +export SPLITID=${ALIEN_O2DPG_GRIDSUBMIT_SUBJOBID:-50} +export CYCLE=0 + +# generator and other sim configuration +export ALIEN_JDL_ANCHOR_SIM_OPTIONS="%{SIM_OPTIONS}" + +# execute MC +# ${O2DPG_ROOT}/MC/run/ANCHOR/anchorMC.sh diff --git a/MC/run/ANCHOR/tests/test_anchor_cases.csv b/MC/run/ANCHOR/tests/test_anchor_cases.csv new file mode 100644 index 000000000..490b7bba5 --- /dev/null +++ b/MC/run/ANCHOR/tests/test_anchor_cases.csv @@ -0,0 +1,22 @@ +# comment (no empty lines allowed) 
+%{SOFTWARETAG_SIM},%{SOFTWARETAG_ASYNC},%{PASSNAME},%{COL_SYSTEM},%{RUN_NUMBER},%{INTERACTIONTYPE},%{ANCHOR_PRODUCTION},%{ANCHORYEAR},%{SIM_OPTIONS},%{PRODUCTION_TAG} +#O2sim::v20250306-1,O2PDPSuite::async-async-v1-01-08-slc9-alidist-async-v1-01-01-1,apass7,p-p,526641,pp,LHC25a9_Plus10,LHC22o,2022,-gen pythia8 +#O2sim::v20250305-1,O2PDPSuite::async-async-v1-01-12-slc9-alidist-async-v1-01-01-1,apass1,p-p,551398,pp,LHC25a7_Plus10,LHC24ag,2024,-gen pythia8 +#O2sim::v20250305-1,O2PDPSuite::async-async-v1-02-10-slc9-alidist-async-v1-02-01-1,apass1,Pb-Pb,559544,PbPb,LHC25c5b,LHC24ar,2024,-gen pythia8 -confKey 'SimCutParams.globalDensityFactor=0.9f' +#O2sim::v20250806-1,O2PDPSuite::async-async-2024-PbPb-apass1-v2-slc9-alidist-async-2024-PbPb-apass1-v2-1,apass1,Pb-Pb,559544,PbPb,LHC25c5b,LHC24ar,2024,-gen pythia8 +# 2022 +# O2PDPSuite::async-async-2022-pp-apass7-v1,apass7, | 526641 +# 2023 +O2sim::v20250806-1,O2PDPSuite::async-async-2023-PbPb-apass5-v5-slc9-alidist-async-2023-PbPb-apass5-v5-1,apass5,Pb-Pb,544091,PbPb,LHC23zzh,2023,-gen pythia8 +# 2024 +O2sim::v20250806-1,O2PDPSuite::async-async-2024-pp-apass1-v7-slc9-alidist-async-2024-pp-apass1-v7-1,apass1,p-p,553185,pp,LHC24al,2024,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2024-ppRef-apass1-v1-slc9-alidist-async-2024-ppRef-apass1-v1-1,apass1,p-p,559348,pp,LHC24ap,2024,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2024-pbpb-apass2-v3-slc9-alidist-async-2024-pbpb-apass2-v3-1,apass2,Pb-Pb,559545,PbPb,LHC24ar,2024,-gen pythia8 +# 2025 +O2sim::v20250806-1,O2PDPSuite::async-async-2025-pO-apass1-v2-slc9-alidist-async-2025-pO-apass1-v2-1,apass1,p-O,564251,pO,LHC25ad,2025,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2025-OO-apass1-v2-slc9-alidist-async-2025-OO-apass1-v2-1,apass1,O-O,564356,OO,LHC25ae,2025,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2025-NeNe-apass1-v2-slc9-alidist-async-2025-NeNe-apass1-v2-1,apass1,Ne-Ne,564468,NeNe,LHC25af,2025,-gen pythia8 +# apass2 
+O2sim::v20250806-1,O2PDPSuite::async-async-2025-pO-apass2-v1-slc9-alidist-async-2025-pO-apass2-v1-1,apass2,p-O,564251,pO,LHC25ad,2025,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2025-OO-apass2-v1-slc9-alidist-async-2025-OO-apass2-v1-1,apass2,O-O,564356,OO,LHC25ae,2025,-gen pythia8 +O2sim::v20250806-1,O2PDPSuite::async-async-2025-NeNe-apass2-v1-slc9-alidist-async-2025-NeNe-apass2-v1-1,apass2,Ne-Ne,564468,NeNe,LHC25af,2025,-gen pythia8 \ No newline at end of file diff --git a/MC/run/ANCHOR/tests/test_looper.sh b/MC/run/ANCHOR/tests/test_looper.sh new file mode 100755 index 000000000..7a243746b --- /dev/null +++ b/MC/run/ANCHOR/tests/test_looper.sh @@ -0,0 +1,142 @@ +#!/bin/bash +# loops over all test cases and executes them + +# Read the CSV file +INPUT_FILE="test_anchor_cases.csv" +TEMPLATE_FILE="test_anchor_2tag_template.sh" +OUTPUT_FILE="test_anchor_generated" + +DAILYTAGTOTEST=${1:-O2sim::v20250804-1} + +SITES_FILE="test_GRID_sites.dat" + +WORKING_DIR="${PWD}/workdir_$(date +%s)_$RANDOM" +echo "WORKING DIR ${WORKING_DIR}" +mkdir -p ${WORKING_DIR} + +INPUT_FILE_STRIPPED=${WORKING_DIR}/${INPUT_FILE}_clean + +REQUIRE_STRING="" +{ + while read -r -a values; do + if [ ! 
"${REQUIRE_STRING}" == "" ]; then + REQUIRE_STRING="${REQUIRE_STRING} ||" + fi + REQUIRE_STRING="${REQUIRE_STRING} (other.CE == \"${values}\")" + done +} < ${SITES_FILE} +REQUIRE_STRING="(${REQUIRE_STRING});" + +echo "REQUIRE STRING ${REQUIRE_STRING}" + +# strip comments from CSV file +grep -v '#' ${INPUT_FILE} > ${INPUT_FILE_STRIPPED} + +# Read the header line and convert it into variable names +IFS=',' read -r -a headers < "$INPUT_FILE_STRIPPED" + +# Replace placeholders in the header (e.g., %{VAR} → VAR) +for i in "${!headers[@]}"; do + headers[$i]=$(echo "${headers[$i]}" | sed -E 's/#?%\{//;s/\}//g') +done + +# Read and process each subsequent line +{ + read # Skip the header line + + count=1 # Counter for output files + datestring=$(date +"%Y%m%d_%H%M%S") + while IFS=',' read -r -a values; do + # Assign each value to its corresponding variable + for i in "${!headers[@]}"; do + declare "${headers[$i]}"="${values[$i]}" + done + + PRODUCTION_TAG="2tagtest_${datestring}_${count}" + # Example: Print assigned variables + echo "SOFTWARETAG_SIM: $SOFTWARETAG_SIM" + echo "SOFTWARETAG_ASYNC: $SOFTWARETAG_ASYNC" + echo "PASSNAME: $PASSNAME" + echo "COL_SYSTEM: $COL_SYSTEM" + echo "RUN_NUMBER: $RUN_NUMBER" + echo "INTERACTIONTYPE: $INTERACTIONTYPE" + echo "PRODUCTION_TAG: $PRODUCTION_TAG" + echo "ANCHOR_PRODUCTION: $ANCHOR_PRODUCTION" + echo "ANCHORYEAR: $ANCHORYEAR" + echo "SIM_OPTIONS: $SIM_OPTIONS" + echo "--------------------------------" + + if [ "${DAILYTAGTOTEST}" ]; then + SOFTWARETAG_SIM=${DAILYTAGTOTEST} + fi + + OUTPUT_FILE_FINAL="${WORKING_DIR}/${OUTPUT_FILE}_case${count}.sh" + + # create final test script with these values + cp "$TEMPLATE_FILE" "${OUTPUT_FILE_FINAL}" + for var in "${headers[@]}"; do + sed -i "s|%{$var}|${!var}|g" "$OUTPUT_FILE_FINAL" + done + # put the require spec + sed -i "s/%{JDL_REQUIREMENT}/${REQUIRE_STRING}/g" "$OUTPUT_FILE_FINAL" + + # we submit the test to the GRID (multiplicity of 4) + # 
${WORKING_DIR}/submit_case${count}_${SOFTWARETAG_ASYNC//::/-} + echo "${O2DPG_ROOT}/GRID/utils/grid_submit.sh --prodsplit 4 --singularity --ttl 3600 --script ${OUTPUT_FILE_FINAL} --jobname "anchorTest_${count}" --wait-any --topworkdir 2tag_release_testing_${SOFTWARETAG_SIM}" > ${WORKING_DIR}/submit_case${count}.sh + # TODO: optional local execution with --local option + + ((count++)) # Increment counter for next row + done +} < "${INPUT_FILE_STRIPPED}" #Redirect file input here to avoid subshell issues + +cd ${WORKING_DIR} + +# now we submit all the jobs in the background and wait for them to return +for s in `ls submit*.sh`; do + echo "submitting ${s}" + export GRID_SUBMIT_WORKDIR="${WORKING_DIR}/${s}_workdir" + ( + bash ${s} &> log_${s} + echo "Job ${s} returned" + ) & +done + +# wait for all (GRID) jobs to return +echo "Waiting for jobs to return/finish" +wait + +# verify / validate the output produced from these jobs +# The test is successful if at least one subjob from each test +# produced the AO2D output. +echo "-- Jobs done ... validating --" + +FINAL_SUCCESS=0 +for s in `ls submit*.sh`; do + # find output path + TEST_OUTPUT_PATH="${WORKING_DIR}/${s}_workdir" # $(grep "Local working directory is" log_${s} | awk '//{print $5}') + + # get the Output path on JAlien from the JDL + ALIEN_OUTPUT_FOLDER=$(grep 'OutputDir' ${TEST_OUTPUT_PATH}/*.jdl | cut -d'"' -f2 | sed 's|/[^/]*#.*#.*$||') + + # see if there is an AO2D.root and a workflow.json in one of the jobs in that folder + AODS_FOUND=$(alien.py find ${ALIEN_OUTPUT_FOLDER} AO2D.root) + WORKFLOWS_FOUND=$(alien.py find ${ALIEN_OUTPUT_FOLDER} workflow.json) + + if [[ -z ${WORKFLOWS_FOUND} || -z ${AODS_FOUND} ]]; then + echo "❌ Missing files for case $s" + FINAL_SUCCESS=1 # mark as failure + else + echo "✅ Files found in $s" + fi +done + +if [[ ${FINAL_SUCCESS} -eq 0 ]]; then + echo "✅ All submissions have required files." +else + echo "❌ Some submissions are missing required files." 
+fi + +#TODO: echo "-- Cleaning up ... " +cd .. + +exit ${FINAL_SUCCESS} \ No newline at end of file