From 5021ba67705c0281ed4b40eae8d290cc0ef43442 Mon Sep 17 00:00:00 2001 From: rbyche <133010095+rbyche@users.noreply.github.com> Date: Thu, 7 Sep 2023 17:10:31 +0900 Subject: [PATCH] Barrier for Horovod KV Store API --- python/mxnet/kvstore/horovod.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/mxnet/kvstore/horovod.py b/python/mxnet/kvstore/horovod.py index 20a0cd89edaa..104493d91b03 100644 --- a/python/mxnet/kvstore/horovod.py +++ b/python/mxnet/kvstore/horovod.py @@ -19,6 +19,7 @@ """ Key value store interface of MXNet for Horovod """ from __future__ import absolute_import from .base import KVStoreBase +from ..ndarray import empty __all__ = ['Horovod'] @@ -159,3 +160,16 @@ def local_rank(self): def num_workers(self): import horovod.mxnet as hvd return hvd.size() + + def _barrier(self): + """Invokes global barrier among all worker nodes. + + For example, assume there are `n` machines. We would like machine `0` to first + `init` the values and then have all the workers `pull` the initialized value. + Before pulling, we can invoke `_barrier()` to guarantee that the + initialization is finished. + """ + import horovod.mxnet as hvd + request = empty(0) + hvd.allreduce_(request, name='_barrier') + request.wait_to_read()