Skip to content

Commit 61ae62b

Browse files
committed
pipeline runner: psutil Stability fixes and workarounds
It turns out that psutil's children() lookup does not have sufficient privileges to determine the list of child processes on some GRID Unix systems. In such cases, we now fall back to a custom solution based on a bash function.
1 parent 6b1f33f commit 61ae62b

File tree

1 file changed

+53
-7
lines changed

1 file changed

+53
-7
lines changed

MC/bin/o2_dpg_workflow_runner.py

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,40 @@ def setup_logger(name, log_file, level=logging.INFO):
4040
# Module-level logger named 'pipeline_metric_logger'; setup_logger is given
# 'pipeline_metric.log' as its log file argument.
metriclogger = setup_logger('pipeline_metric_logger', 'pipeline_metric.log')
4141

4242

43+
# A fallback solution to getting all child procs
44+
# in case psutil has problems (PermissionError).
45+
# It returns the same list as psutil.children(recursive=True).
46+
def getChildProcs(basepid):
    """Return all descendant processes of ``basepid`` without psutil's child lookup.

    This is a fallback for systems on which
    ``psutil.Process.children(recursive=True)`` fails (e.g. with
    ``PermissionError``). The process tree is discovered by repeatedly asking
    ``pgrep -P <pid>`` for the direct children of each process found so far.

    Args:
        basepid: PID of the process whose descendants are requested.

    Returns:
        A list of ``psutil.Process`` objects, in depth-first order, for every
        descendant that still exists when it is looked up. ``basepid`` itself
        is NOT part of the list, matching ``children(recursive=True)``.
    """
    root = int(basepid)
    seen = {root}          # guards against revisiting a PID (e.g. PID reuse)
    pending = [root]       # stack of PIDs whose children still need listing
    descendants = []

    while pending:
        parent = pending.pop()
        if parent != root:
            descendants.append(parent)
        try:
            # Argument list instead of a shell string: nothing is interpreted
            # by a shell. pgrep exits with 1 when there are no matches, which
            # simply yields empty output here.
            result = subprocess.run(['pgrep', '-P', str(parent)],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.DEVNULL)
        except OSError:
            # pgrep is not available or could not be started; report what was
            # found so far instead of raising towards the caller.
            break
        children = [int(tok) for tok in result.stdout.split() if tok.isdigit()]
        fresh = [pid for pid in children if pid not in seen]
        seen.update(fresh)
        # Reversed so that the stack pops children in pgrep's order.
        pending.extend(reversed(fresh))

    plist = []
    for pid in descendants:
        try:
            plist.append(psutil.Process(pid))
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process vanished (or became inaccessible) in the meantime.
            continue
    return plist
76+
4377
#
4478
# Code section to find all topological orderings
4579
# of a DAG. This is used to know when we can schedule
@@ -363,20 +397,28 @@ def __init__(self, workflowfile, args, jmax=100):
363397
def SIGHandler(self, signum, frame):
    """Signal handler: shut down all child processes, then exit with status 1.

    Children are first asked to terminate; those still alive after a 3 second
    grace period are killed.

    Args:
        signum: Number of the signal that was caught (only logged).
        frame: Current stack frame as passed by the signal module (unused).
    """
    actionlogger.info("Signal " + str(signum) + " caught")

    own_pid = os.getpid()
    try:
        procs = psutil.Process().children(recursive=True)
    except psutil.NoSuchProcess:
        # Nothing to clean up; keep procs defined so the code below still runs.
        procs = []
    except (psutil.AccessDenied, PermissionError):
        # psutil may lack the privileges to list children on some systems;
        # use the pgrep-based fallback and make sure we never signal ourselves.
        procs = [p for p in getChildProcs(own_pid) if p.pid != own_pid]

    for p in procs:
        actionlogger.info("Terminating " + str(p))
        try:
            p.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # Give the children a short grace period before killing the survivors.
    gone, alive = psutil.wait_procs(procs, timeout=3)
    for p in alive:
        try:
            # str(p) queries the process, so logging is inside the try as well.
            actionlogger.info("Killing " + str(p))
            p.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    exit(1)
381423

382424
def getallrequirements(self, t):
@@ -434,6 +476,7 @@ def submit(self, tid, nice=0):
434476
p.nice(nice)
435477
self.nicevalues[tid]=nice
436478
except (psutil.NoSuchProcess, psutil.AccessDenied):
479+
actionlogger.error('Couldn\'t set nice value of ' + str(p.pid) + ' to ' + str(nice) + ' -- current value is ' + str(p.nice()))
437480
self.nicevalues[tid]=0
438481
return p
439482

@@ -539,9 +582,12 @@ def monitor(self, process_list):
539582
psutilProcs = [ proc ]
540583
# use psutil for CPU measurement
541584
psutilProcs = psutilProcs + proc.children(recursive=True)
542-
except (psutil.NoSuchProcess, psutil.AccessDenied):
585+
except (psutil.NoSuchProcess):
543586
continue
544587

588+
except (psutil.AccessDenied, PermissionError):
589+
psutilProcs = psutilProcs + getChildProcs(pid)
590+
545591
# accumulate total metrics (CPU, memory)
546592
totalCPU = 0.
547593
totalPSS = 0.

0 commit comments

Comments
 (0)