diff --git a/GRID/utils/getReproducerScript.sh b/GRID/utils/getReproducerScript.sh
new file mode 100755
index 000000000..6d3ab119b
--- /dev/null
+++ b/GRID/utils/getReproducerScript.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# Fetch the job script of a GRID job from MonaLisa and turn it into a local
+# reproducer that relaunches itself inside the ALICE apptainer container.
+#
+# Usage:  getReproducerScript.sh ALIEN_PID
+# Tokens: JALIEN_TOKEN_CERT / JALIEN_TOKEN_KEY when both name existing files,
+#         else tokencert_UID.pem / tokenkey_UID.pem in ${TMPDIR:-/tmp}
+# Output: reproducer_script_ALIEN_PID.sh in the current directory
+
+ALIEN_PID=$1
+if [ -z "${ALIEN_PID}" ]; then
+  echo "Usage: $0 ALIEN_PID" >&2
+  exit 1
+fi
+
+# NOTE(review): the JALIEN_TOKEN_* variables are only used when they point to
+# files, since they are handed to curl as file names below; presumably they
+# can hold something else in some environments - confirm.
+if [ -f "${JALIEN_TOKEN_CERT}" ] && [ -f "${JALIEN_TOKEN_KEY}" ]; then
+  TOKENCERT=${JALIEN_TOKEN_CERT}
+  TOKENKEY=${JALIEN_TOKEN_KEY}
+else
+  TOKENDIR=${TMPDIR:-/tmp}
+  USERID=$(id -u)
+  if [ -f "${TOKENDIR}/tokencert_${USERID}.pem" ]; then
+    TOKENCERT=${TOKENDIR}/tokencert_${USERID}.pem
+  fi
+  if [ -f "${TOKENDIR}/tokenkey_${USERID}.pem" ]; then
+    TOKENKEY=${TOKENDIR}/tokenkey_${USERID}.pem
+  fi
+fi
+
+# both files are required: with an empty TOKENKEY curl would take the next
+# option as the key file name
+if [ -z "${TOKENCERT}" ] || [ -z "${TOKENKEY}" ]; then
+  echo "This needs a tokencert and tokenkey file in the tmp folder"
+  exit 1
+fi
+
+SCRIPT=reproducer_script_${ALIEN_PID}.sh
+# talk to MonaLisa to fetch the script provided by Costin
+# (--fail: an HTTP error page must not end up as the reproducer script)
+if ! curl --fail "https://alimonitor.cern.ch/users/jobenv.jsp?pid=${ALIEN_PID}" \
+  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36' \
+  --insecure --cert "${TOKENCERT}" --key "${TOKENKEY}" -o "${SCRIPT}"; then
+  echo "Could not fetch the job script for job ${ALIEN_PID}" >&2
+  exit 1
+fi
+
+# Define the Apptainer injection block which makes sure
+# that the job script is automatically executed in apptainer
+INJECTION='
+export ALIEN_PID=#ALIEN_PID#
+# Check if the script is running inside an Apptainer (Singularity) container
+if [ -z "$APPTAINER_NAME" ] && [ -z "$SINGULARITY_NAME" ]; then
+  # Relaunch this script inside the container
+
+  export WORKDIR=/tmp/foo-${ALIEN_PID}
+  # - create the workdir and the folder that becomes /tmp inside the container
+  #   (-p: no failure when the reproducer is run a second time)
+  mkdir -p "${WORKDIR}/tmp"
+
+  # - copy the certificate token into /tmp/ inside the container
+  cp /tmp/token*pem "${WORKDIR}/tmp"
+
+  # - copy the job script into workdir
+  cp "$0" "${WORKDIR}"
+
+  # detect architecture (ARM or X86); uname -i is not portable, use -m
+  ARCH=$(uname -m)
+  if [ "$ARCH" == "aarch64" ] || [ "$ARCH" == "x86_64" ]; then
+    echo "Detected hardware architecture : $ARCH"
+  else
+    echo "Invalid architecture ${ARCH} detected. Exiting"
+    exit 1
+  fi
+  if [ "$ARCH" == "aarch64" ]; then
+    ISAARCH64="1"
+  fi
+
+  CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
+  APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"
+
+  # we can actually analyse the local JDL to find the package and set it up for the container
+  # (the script is addressed through its copy, since only ${WORKDIR} is bound as /workdir)
+  ${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir,${WORKDIR}/tmp:/tmp --pwd /workdir ${CONTAINER} "/workdir/$(basename "$0")"
+  exit $?
+fi
+'
+
+# Inject the block after the first line (shebang)
+awk -v block="$INJECTION" 'NR==1 {print; print block; next} 1' "$SCRIPT" > "${SCRIPT}.tmp" && mv "${SCRIPT}.tmp" "$SCRIPT"
+
+# take out sandboxing structure
+sed -i "/echo \"Create a fresh sandbox at every attempt of running the job: alien-job-$ALIEN_PID\"/d" "$SCRIPT"
+sed -i "/rm -rf alien-job-$ALIEN_PID/d" "$SCRIPT"
+sed -i "/mkdir -p alien-job-$ALIEN_PID\/tmp/d" "$SCRIPT"
+sed -i "/cd alien-job-$ALIEN_PID/d" "$SCRIPT"
+
+# replace the PID
+sed -i "s/#ALIEN_PID#/${ALIEN_PID}/g" "$SCRIPT"
+
+chmod +x "${SCRIPT}"
diff --git a/GRID/utils/grid_submit.sh b/GRID/utils/grid_submit.sh
index ba4b2a843..a2ef4c125 100755
--- a/GRID/utils/grid_submit.sh
+++ b/GRID/utils/grid_submit.sh
@@ -370,7 +370,7 @@ EOF
   [ $ERROROUTPUTSPEC ] && echo "OutputErrorE = {"${ERROROUTPUTSPEC}"};" >> "${MY_JOBNAMEDATE}.jdl" # add error output files
   [ $IMAGESPEC ] && echo "DebugTag = {\"${IMAGESPEC}\"};" >> "${MY_JOBNAMEDATE}.jdl" # use special singularity image to run job
   # echo "Requirements = {"${REQUIREMENTSSPEC}"} >> "${MY_JOBNAMEDATE}.jdl"
-  [ $REQUIRESPEC ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"
+  [ "$REQUIRESPEC" ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"
 
   # "output_arch.zip:output/*@disk=2",
   # "checkpoint*.tar@disk=2"
@@ -383,6 +383,8 @@ EOF
   (
     # assemble all GRID interaction in a single script / transaction
     [ -f "${command_file}" ] && rm ${command_file}
+    echo "user ${MY_USER}" >> ${command_file}
+    echo "whoami" >> ${command_file}
     [ ! "${CONTINUE_WORKDIR}" ] && echo "rmdir ${MY_JOBWORKDIR}" >> ${command_file} # remove existing job dir
     # echo "mkdir ${MY_BINDIR}" >> ${command_file} # create bindir
     echo "mkdir ${MY_JOBPREFIX}" >> ${command_file} # create job output prefix
@@ -434,7 +436,7 @@ EOF
       continue
     fi
     let counter=0 # reset counter
-    JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $4}')
+    JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}')
     # echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
 
     if [ "${JOBSTATUS}" == "D" ]; then
@@ -489,7 +491,24 @@ if [[ ${SINGULARITY} ]]; then
   # it's actually much like the GRID mode --> which is why we set JALIEN_TOKEN_CERT
   set -x
   cp $0 ${WORKDIR}
-  singularity exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --env JALIEN_TOKEN_CERT="foo" --pwd /workdir /cvmfs/alice.cern.ch/containers/fs/singularity/centos7 $0 \
+
+  # detect architecture (ARM or X86); uname -i is not portable, use -m
+  ARCH=$(uname -m)
+  if [ "$ARCH" == "aarch64" ] || [ "$ARCH" == "x86_64" ]; then
+    echo "Detected hardware architecture : $ARCH"
+  else
+    echo "Invalid architecture ${ARCH} detected. Exiting"
+    exit 1
+  fi
+  if [ "$ARCH" == "aarch64" ]; then
+    ISAARCH64="1"
+  fi
+
+  CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
+  APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"
+
+  # we can actually analyse the local JDL to find the package and set it up for the container
+  ${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --pwd /workdir ${CONTAINER} /workdir/grid_submit.sh \
   ${CONTINUE_WORKDIR:+"-c ${CONTINUE_WORKDIR}"} --local ${O2TAG:+--o2tag ${O2TAG}} --ttl ${JOBTTL} --label ${JOBLABEL:-label} ${MATTERMOSTHOOK:+--mattermost ${MATTERMOSTHOOK}} ${CONTROLSERVER:+--controlserver ${CONTROLSERVER}}
   set +x
   exit $?
@@ -515,7 +534,6 @@ banner "Limits"
 ulimit -a
 
 banner "OS detection"
-lsb_release -a || true
 cat /etc/os-release || true
 cat /etc/redhat-release || true
 
diff --git a/MC/run/ANCHOR/anchorMC.sh b/MC/run/ANCHOR/anchorMC.sh
index e8170746a..20b210509 100755
--- a/MC/run/ANCHOR/anchorMC.sh
+++ b/MC/run/ANCHOR/anchorMC.sh
@@ -144,6 +144,13 @@ fi
 
 [ -z "${CYCLE}" ] && { echo_error "Set CYCLE" ; exit 1 ; }
 [ -z "${PRODSPLIT}" ] && { echo_error "Set PRODSPLIT" ; exit 1 ; }
+
+# this generates an exact reproducer script for this job
+# that can be used locally for debugging etc. (only when both variables are set)
+if [[ -n "${ALIEN_PROC_ID}" && -n "${JALIEN_WSPORT}" ]]; then
+  "${O2DPG_ROOT}/GRID/utils/getReproducerScript.sh" "${ALIEN_PROC_ID}"
+fi
+
 # also for this keep a real default
 NWORKERS=${NWORKERS:-8}
 # set a default seed if not given
@@ -370,7 +377,7 @@ fi
 # full logs tar-ed for output, regardless the error code or validation - to catch also QC logs...
 #
 if [[ -n "$ALIEN_PROC_ID" ]]; then
-  find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" \) | tar -czvf debug_log_archive.tgz -T -
+  find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" -o -name "reproducer*.sh" \) | tar -czvf debug_log_archive.tgz -T -
   if [[ "$ALIEN_JDL_CREATE_TAR_IN_MC" == "1" ]]; then
     find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "*.root" \) | tar -czvf debug_full_archive.tgz -T -
   fi