From 9f72cc36bdc5a9a4e4c2ad3e582dc666c9001010 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 20 Nov 2025 22:03:04 -0500 Subject: [PATCH 01/17] wip: reduce_sum --- make/compiler_flags | 2 +- stan/math/rev/functor/map_rect_concurrent.hpp | 48 +++-- stan/math/rev/functor/reduce_sum.hpp | 168 +++++++++++++++++- 3 files changed, 203 insertions(+), 15 deletions(-) diff --git a/make/compiler_flags b/make/compiler_flags index 177bebca39b..0febc5342de 100644 --- a/make/compiler_flags +++ b/make/compiler_flags @@ -342,7 +342,7 @@ endif # Sets up CXXFLAGS_THREADS to use threading ifdef STAN_THREADS - CXXFLAGS_THREADS ?= -DSTAN_THREADS + CXXFLAGS_THREADS ?= -DSTAN_THREADS -pthread endif ################################################################################ diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index 878261056fb..4e32543adaf 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -12,6 +12,8 @@ #include #include +#include +#include #include namespace stan { @@ -46,18 +48,40 @@ map_rect_concurrent( }; #ifdef STAN_THREADS - // we must use task isolation as described here: - // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html - // this is to ensure that the thread local AD tape ressource is - // not being modified from a different task which may happen - // whenever this function is being used itself in a parallel - // context (like running multiple chains for Stan) - tbb::this_task_arena::isolate([&] { - tbb::parallel_for(tbb::blocked_range(0, num_jobs), - [&](const tbb::blocked_range& r) { - execute_chunk(r.begin(), r.end()); - }); - }); + std::cout << "********************************************************************************" << std::endl; + if (num_jobs > 1) { + // simple chunked threading over [0, num_jobs) + unsigned hw_threads = std::thread::hardware_concurrency(); + if (hw_threads == 0) { + hw_threads = 2; // arbitrary but > 0 + } + + const unsigned max_threads + = static_cast(std::min(hw_threads, num_jobs)); + std::cout << "max_threads = " << max_threads << std::endl; + std::vector threads; + threads.reserve(max_threads); + + const std::size_t chunk + = (num_jobs + max_threads - 1) / max_threads; // ceil + + for (unsigned t = 0; t < max_threads; ++t) { + const std::size_t start = t * chunk; + if (start >= num_jobs) break; + const std::size_t end + = std::min(start + chunk, num_jobs); + + threads.emplace_back([&, start, end] { + execute_chunk(start, end); + }); + } + + for (auto& th : threads) { + th.join(); + } + } else { + execute_chunk(0, num_jobs); + } #else execute_chunk(0, num_jobs); #endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 25cbef1e073..9878acb1307 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -5,6 +5,8 @@ #include #include +#include + #include #include #include @@ -74,13 +76,79 @@ struct reduce_sum_impl, ReturnType, * to zero since the newly created reducer is used to accumulate * an independent partial sum. 
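   * (Under tbb::parallel_reduce a split copy accumulates an independent
   * partial sum that is later combined via join(); the std::thread version
   * introduced below instead constructs one reducer per worker thread up
   * front, so no splitting constructor is needed.)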
*/ + /* recursive_reducer(recursive_reducer& other, tbb::split) : num_vars_per_term_(other.num_vars_per_term_), num_vars_shared_terms_(other.num_vars_shared_terms_), sliced_partials_(other.sliced_partials_), vmapped_(other.vmapped_), args_tuple_(other.args_tuple_) {} + */ + +inline void operator()(std::size_t begin, std::size_t end) { + if (begin == end) { + return; + } + + if (args_adjoints_.size() == 0) { + args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); + } + // local copy of shared arguments in a local stack + if (!local_args_tuple_scope_.args_tuple_holder_) { + local_args_tuple_scope_.stack_.execute([&]() { + math::apply( + [&](auto&&... args) { + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(args)...); + }, + args_tuple_); + }); + } else { + // set adjoints of shared arguments to zero + local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); + } + + auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); + + // Initialize nested autodiff stack + const nested_rev_autodiff begin_nest; + + // Create nested autodiff copies of sliced argument that do not point + // back to main autodiff stack + std::decay_t local_sub_slice; + local_sub_slice.reserve(end - begin); + for (std::size_t i = begin; i < end; ++i) { + local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); + } + + // Perform calculation + var sub_sum_v = math::apply( + [&](auto&&... args) { + return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, + args...); + }, + args_tuple_local); + + // Compute Jacobian + sub_sum_v.grad(); + + // accumulate value + sum_ += sub_sum_v.val(); + + // accumulate adjoints of sliced_arguments + accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, + std::move(local_sub_slice)); + + // accumulate adjoints of shared_arguments + math::apply( + [&](auto&&... args) { + accumulate_adjoints(args_adjoints_.data(), args...); + }, + args_tuple_local); +} + /** * Compute, using nested autodiff, the value and Jacobian of * `ReduceFunction` called over the range defined by r and accumulate those @@ -94,7 +162,7 @@ struct reduce_sum_impl, ReturnType, * * @param r Range over which to compute reduce_sum */ - inline void operator()(const tbb::blocked_range& r) { + /* inline void operator()(const tbb::blocked_range& r) { if (r.empty()) { return; } @@ -163,7 +231,7 @@ struct reduce_sum_impl, ReturnType, }, args_tuple_local); } - + */ /** * Join reducers. Accumuluate the value (sum_) and Jacobian (arg_adoints_) * of the other reducer. @@ -221,6 +289,101 @@ struct reduce_sum_impl, ReturnType, * @param args Shared arguments used in every sum term * @return Summation of all terms */ + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int /*grainsize*/, + std::ostream* msgs, Args&&... 
args) const { + if (vmapped.empty()) { + return var(0.0); + } + + const std::size_t num_terms = vmapped.size(); + const std::size_t num_vars_per_term = count_vars(vmapped[0]); + const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; + const std::size_t num_vars_shared_terms = count_vars(args...); + + vari** varis + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + + save_varis(varis, vmapped); + save_varis(varis + num_vars_sliced_terms, args...); + + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { + partials[i] = 0.0; + } + + // --- simple std::thread parallelism --- + + // how many threads to use + const unsigned hw = std::thread::hardware_concurrency(); + const std::size_t max_threads = hw == 0 ? 2 : hw; + const std::size_t num_threads = std::min(max_threads, num_terms); + + // each thread gets its own reducer, but they all share the same partials buffer + // (sliced_partials_) and write to disjoint regions + std::vector> workers; + workers.reserve(num_threads); + + std::vector threads; + threads.reserve(num_threads); + + std::size_t block_begin = 0; + for (std::size_t t = 0; t < num_threads; ++t) { + std::size_t block_end + = (t + 1 == num_threads) + ? num_terms + : (num_terms * (t + 1)) / num_threads; + + // construct reducer for this thread + workers.emplace_back(std::make_unique( + num_vars_per_term, num_vars_shared_terms, partials, + vmapped, args...)); + + auto* wptr = workers.back().get(); + + threads.emplace_back([wptr, block_begin, block_end]() { + // each worker thread needs its own AD tape + static thread_local ChainableStack ad_tape; + wptr->operator()(block_begin, block_end); + }); + + block_begin = block_end; + } + + for (auto& th : threads) { + th.join(); + } + + // aggregate results + double total_sum = 0.0; + Eigen::VectorXd shared_adjoints + = Eigen::VectorXd::Zero(num_vars_shared_terms); + std::stringstream all_msgs; + + for (auto& w : workers) { + total_sum += w->sum_; + if (w->args_adjoints_.size() != 0) { + shared_adjoints += w->args_adjoints_; + } + all_msgs << w->msgs_.str(); + } + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); + } + + if (msgs) { + *msgs << all_msgs.str(); + } + + return var( + new precomputed_gradients_vari(total_sum, + num_vars_sliced_terms + num_vars_shared_terms, + varis, partials)); +} + /* inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize, std::ostream* msgs, Args&&... 
args) const { if (vmapped.empty()) { @@ -278,6 +441,7 @@ struct reduce_sum_impl, ReturnType, worker.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } + */ }; } // namespace internal From a7a95fb71593ebe90829719755fb39a41d993e39 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 4 Dec 2025 10:36:19 -0500 Subject: [PATCH 02/17] 10x slower --- stan/math/rev/core/simple_thread_pool.hpp | 155 ++++++++ stan/math/rev/functor/reduce_sum.hpp | 446 ++++++---------------- 2 files changed, 278 insertions(+), 323 deletions(-) create mode 100644 stan/math/rev/core/simple_thread_pool.hpp diff --git a/stan/math/rev/core/simple_thread_pool.hpp b/stan/math/rev/core/simple_thread_pool.hpp new file mode 100644 index 00000000000..84e1adf7c55 --- /dev/null +++ b/stan/math/rev/core/simple_thread_pool.hpp @@ -0,0 +1,155 @@ +#ifndef STAN_MATH_REV_CORE_SIMPLE_THREAD_POOL_HPP +#define STAN_MATH_REV_CORE_SIMPLE_THREAD_POOL_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { + +class SimpleThreadPool { +public: + static SimpleThreadPool& instance() { + static SimpleThreadPool pool; + return pool; + } + + SimpleThreadPool(const SimpleThreadPool&) = delete; + SimpleThreadPool& operator=(const SimpleThreadPool&) = delete; + + std::size_t thread_count() const noexcept { return workers_.size(); } + + template + auto submit(F&& f, Args&&... args) + -> std::future> { + using R = std::invoke_result_t; + + auto task_ptr = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + + enqueue_([task_ptr] { (*task_ptr)(); }); + return task_ptr->get_future(); + } + + template + void parallel_region(std::size_t n, F&& fn) { + if (n == 0) return; + + // Avoid nested parallelism deadlocks/oversubscription. + if (in_worker_) { + fn(std::size_t{0}); + return; + } + + const std::size_t tc = thread_count(); + if (tc == 0) { + fn(std::size_t{0}); + return; + } + if (n > tc) n = tc; + + using Fn = std::decay_t; + struct Shared { + std::atomic remaining; + std::mutex m; + std::condition_variable cv; + Fn fn; + Shared(std::size_t n_, Fn&& f_) : remaining(n_), fn(std::move(f_)) {} + }; + + auto shared = std::make_shared(n, Fn(std::forward(fn))); + + for (std::size_t tid = 0; tid < n; ++tid) { + enqueue_([shared, tid] { + shared->fn(tid); + if (shared->remaining.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(shared->m); + shared->cv.notify_one(); + } + }); + } + + std::unique_lock lk(shared->m); + shared->cv.wait(lk, [&] { + return shared->remaining.load(std::memory_order_acquire) == 0; + }); + } + +private: + SimpleThreadPool() : done_(false) { + unsigned hw = std::thread::hardware_concurrency(); + if (hw == 0) hw = 2; + const unsigned num_threads = hw; + + workers_.reserve(num_threads); + for (unsigned i = 0; i < num_threads; ++i) { + workers_.emplace_back([this] { + // Per-worker AD tape (TLS) initialized once. 
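        // (The ChainableStack constructed below gives this worker its own
        // thread-local AD tape; without it, reverse-mode vars created by
        // tasks running on this thread would have no stack to register on.)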
+ static thread_local ChainableStack ad_tape; + + for (;;) { + std::function task; + { + std::unique_lock lock(mtx_); + cv_.wait(lock, [&] { return done_ || !tasks_.empty(); }); + if (done_ && tasks_.empty()) return; + task = std::move(tasks_.front()); + tasks_.pop(); + } + + WorkerScope scope; // sets in_worker_ for all tasks + task(); + } + }); + } + } + + ~SimpleThreadPool() { + { + std::lock_guard lock(mtx_); + done_ = true; + } + cv_.notify_all(); + for (auto& th : workers_) { + if (th.joinable()) th.join(); + } + } + + void enqueue_(std::function task) { + { + std::lock_guard lock(mtx_); + tasks_.emplace(std::move(task)); + } + cv_.notify_one(); + } + + struct WorkerScope { + WorkerScope() : prev_(in_worker_) { in_worker_ = true; } + ~WorkerScope() { in_worker_ = prev_; } + bool prev_; + }; + + static inline thread_local bool in_worker_ = false; + + std::vector workers_; + std::queue> tasks_; + std::mutex mtx_; + std::condition_variable cv_; + bool done_; +}; + +} // namespace math +} // namespace stan + +#endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 9878acb1307..73d868ae097 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -4,15 +4,14 @@ #include #include #include +#include -#include - -#include -#include -#include - -#include +#include +#include +#include #include +#include +#include #include #include @@ -30,8 +29,8 @@ namespace internal { */ template -struct reduce_sum_impl, ReturnType, - Vec, Args...> { +struct reduce_sum_impl, ReturnType, Vec, + Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; using args_tuple_t @@ -41,129 +40,33 @@ struct reduce_sum_impl, ReturnType, scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; - /** - * This struct is used by the TBB to accumulate partial - * sums over consecutive ranges of the input. To distribute the workload, - * the TBB can split larger partial sums into smaller ones in which - * case the splitting copy constructor is used. It is designed to - * meet the Imperative form requirements of `tbb::parallel_reduce`. - * - * @note see link [here](https://tinyurl.com/vp7xw2t) for requirements. - */ struct recursive_reducer { - const size_t num_vars_per_term_; - const size_t num_vars_shared_terms_; // Number of vars in shared arguments - double* sliced_partials_; // Points to adjoints of the partial calculations + const std::size_t num_vars_per_term_; + const std::size_t num_vars_shared_terms_; + double* sliced_partials_; + Vec vmapped_; std::stringstream msgs_; std::tuple args_tuple_; scoped_args_tuple local_args_tuple_scope_; + double sum_{0.0}; Eigen::VectorXd args_adjoints_{0}; template - recursive_reducer(size_t num_vars_per_term, size_t num_vars_shared_terms, - double* sliced_partials, VecT&& vmapped, ArgsT&&... args) + recursive_reducer(std::size_t num_vars_per_term, + std::size_t num_vars_shared_terms, + double* sliced_partials, + VecT&& vmapped, + ArgsT&&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), vmapped_(std::forward(vmapped)), args_tuple_(std::forward(args)...) {} - /* - * This is the copy operator as required for tbb::parallel_reduce - * Imperative form. This requires sum_ and arg_adjoints_ be reset - * to zero since the newly created reducer is used to accumulate - * an independent partial sum. 
- */ - /* - recursive_reducer(recursive_reducer& other, tbb::split) - : num_vars_per_term_(other.num_vars_per_term_), - num_vars_shared_terms_(other.num_vars_shared_terms_), - sliced_partials_(other.sliced_partials_), - vmapped_(other.vmapped_), - args_tuple_(other.args_tuple_) {} - */ - -inline void operator()(std::size_t begin, std::size_t end) { - if (begin == end) { - return; - } - - if (args_adjoints_.size() == 0) { - args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); - } - - // local copy of shared arguments in a local stack - if (!local_args_tuple_scope_.args_tuple_holder_) { - local_args_tuple_scope_.stack_.execute([&]() { - math::apply( - [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( - deep_copy_vars(args)...); - }, - args_tuple_); - }); - } else { - // set adjoints of shared arguments to zero - local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); - } - - auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - - // Initialize nested autodiff stack - const nested_rev_autodiff begin_nest; - - // Create nested autodiff copies of sliced argument that do not point - // back to main autodiff stack - std::decay_t local_sub_slice; - local_sub_slice.reserve(end - begin); - for (std::size_t i = begin; i < end; ++i) { - local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); - } - - // Perform calculation - var sub_sum_v = math::apply( - [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, - args...); - }, - args_tuple_local); - - // Compute Jacobian - sub_sum_v.grad(); - - // accumulate value - sum_ += sub_sum_v.val(); - - // accumulate adjoints of sliced_arguments - accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, - std::move(local_sub_slice)); - - // accumulate adjoints of shared_arguments - math::apply( - [&](auto&&... args) { - accumulate_adjoints(args_adjoints_.data(), args...); - }, - args_tuple_local); -} - - /** - * Compute, using nested autodiff, the value and Jacobian of - * `ReduceFunction` called over the range defined by r and accumulate those - * in member variable sum_ (for the value) and args_adjoints_ (for the - * Jacobian). The nested autodiff uses deep copies of the involved operands - * ensuring that no side effects are implied to the adjoints of the input - * operands which reside potentially on a autodiff tape stored in a - * different thread other than the current thread of execution. This - * function may be called multiple times per object instantiation (so the - * sum_ and args_adjoints_ must be accumulated, not just assigned). - * - * @param r Range over which to compute reduce_sum - */ - /* inline void operator()(const tbb::blocked_range& r) { - if (r.empty()) { + inline void operator()(std::size_t begin, std::size_t end) { + if (begin >= end) { return; } @@ -171,220 +74,64 @@ inline void operator()(std::size_t begin, std::size_t end) { args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); } - // Obtain reference to a local copy of all shared arguments that do - // not point - // back to main autodiff stack - + // Obtain reference to a local copy of all shared arguments that do not + // point back to main autodiff stack. if (!local_args_tuple_scope_.args_tuple_holder_) { - // shared arguments need to be copied to reducer-specific - // scope. In this case no need for zeroing adjoints, since the - // fresh copy has all adjoints set to zero. + // First time: copy shared arguments into reducer-scoped space. 
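        // (deep_copy_vars allocates the new vars on whichever AD stack is
        // currently active, so the copy runs inside stack_.execute() to keep
        // it on this reducer's ScopedChainableStack rather than on the
        // worker's main tape.)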
local_args_tuple_scope_.stack_.execute([&]() { math::apply( [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = std::make_unique< - typename scoped_args_tuple::args_tuple_t>( - deep_copy_vars(args)...); + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(args)...); }, args_tuple_); }); } else { - // set adjoints of shared arguments to zero + // Subsequent calls: reset adjoints of shared arguments. local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); } auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - // Initialize nested autodiff stack + // Start a nested autodiff scope for this chunk. const nested_rev_autodiff begin_nest; - // Create nested autodiff copies of sliced argument that do not point - // back to main autodiff stack + // Copy sliced terms into this nested stack. std::decay_t local_sub_slice; - local_sub_slice.reserve(r.size()); - for (size_t i = r.begin(); i < r.end(); ++i) { + local_sub_slice.reserve(end - begin); + for (std::size_t i = begin; i < end; ++i) { local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); } - // Perform calculation + // Run user reducer over [begin, end-1] var sub_sum_v = math::apply( [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, r.begin(), r.end() - 1, - &msgs_, args...); + return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, + args...); }, args_tuple_local); - // Compute Jacobian + // Compute partial derivatives within this nested stack. sub_sum_v.grad(); - // Accumulate value of reduce_sum + // Accumulate value. sum_ += sub_sum_v.val(); - // Accumulate adjoints of sliced_arguments - accumulate_adjoints(sliced_partials_ + r.begin() * num_vars_per_term_, + // Accumulate adjoints of sliced arguments into the shared partials array. + accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, std::move(local_sub_slice)); - // Accumulate adjoints of shared_arguments + // Accumulate adjoints of shared arguments into this reducer's buffer. math::apply( [&](auto&&... args) { accumulate_adjoints(args_adjoints_.data(), args...); }, args_tuple_local); } - */ - /** - * Join reducers. Accumuluate the value (sum_) and Jacobian (arg_adoints_) - * of the other reducer. - * - * @param rhs Another partial sum - */ - inline void join(const recursive_reducer& rhs) { - sum_ += rhs.sum_; - if (args_adjoints_.size() != 0 && rhs.args_adjoints_.size() != 0) { - args_adjoints_ += rhs.args_adjoints_; - } else if (args_adjoints_.size() == 0 && rhs.args_adjoints_.size() != 0) { - args_adjoints_ = rhs.args_adjoints_; - } - msgs_ << rhs.msgs_.str(); - } }; - /** - * Call an instance of the function `ReduceFunction` on every element - * of an input sequence and sum these terms. - * - * This specialization is parallelized using tbb and works for reverse - * mode autodiff. - * - * ReduceFunction must define an operator() with the same signature as: - * var f(Vec&& vmapped_subset, int start, int end, std::ostream* msgs, - * Args&&... args) - * - * `ReduceFunction` must be default constructible without any arguments - * - * Each call to `ReduceFunction` is responsible for computing the - * start through end (inclusive) terms of the overall sum. All args are - * passed from this function through to the `ReduceFunction` instances. - * However, only the start through end (inclusive) elements of the vmapped - * argument are passed to the `ReduceFunction` instances (as the - * `vmapped_subset` argument). 
- * - * This function distributes computation of the desired sum and the Jacobian - * of that sum over multiple threads by coordinating calls to `ReduceFunction` - * instances. Results are stored as precomputed varis in the autodiff tree. - * - * If auto partitioning is true, break work into pieces automatically, - * taking grainsize as a recommended work size. The partitioning is - * not deterministic nor is the order guaranteed in which partial - * sums are accumulated. Due to floating point imprecisions this will likely - * lead to slight differences in the accumulated results between - * multiple runs. If false, break work deterministically into pieces smaller - * than or equal to grainsize and accumulate all the partial sums - * in the same order. This still may not achieve bitwise reproducibility. - * - * @param vmapped Vector containing one element per term of sum - * @param auto_partitioning Work partitioning style - * @param grainsize Suggested grainsize for tbb - * @param[in, out] msgs The print stream for warning messages - * @param args Shared arguments used in every sum term - * @return Summation of all terms - */ - inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int /*grainsize*/, - std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) { - return var(0.0); - } - - const std::size_t num_terms = vmapped.size(); - const std::size_t num_vars_per_term = count_vars(vmapped[0]); - const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; - const std::size_t num_vars_shared_terms = count_vars(args...); - - vari** varis - = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials - = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - - save_varis(varis, vmapped); - save_varis(varis + num_vars_sliced_terms, args...); - - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { - partials[i] = 0.0; - } - - // --- simple std::thread parallelism --- - - // how many threads to use - const unsigned hw = std::thread::hardware_concurrency(); - const std::size_t max_threads = hw == 0 ? 2 : hw; - const std::size_t num_threads = std::min(max_threads, num_terms); - - // each thread gets its own reducer, but they all share the same partials buffer - // (sliced_partials_) and write to disjoint regions - std::vector> workers; - workers.reserve(num_threads); - - std::vector threads; - threads.reserve(num_threads); - - std::size_t block_begin = 0; - for (std::size_t t = 0; t < num_threads; ++t) { - std::size_t block_end - = (t + 1 == num_threads) - ? 
num_terms - : (num_terms * (t + 1)) / num_threads; - - // construct reducer for this thread - workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, - vmapped, args...)); - - auto* wptr = workers.back().get(); - - threads.emplace_back([wptr, block_begin, block_end]() { - // each worker thread needs its own AD tape - static thread_local ChainableStack ad_tape; - wptr->operator()(block_begin, block_end); - }); - - block_begin = block_end; - } - - for (auto& th : threads) { - th.join(); - } - - // aggregate results - double total_sum = 0.0; - Eigen::VectorXd shared_adjoints - = Eigen::VectorXd::Zero(num_vars_shared_terms); - std::stringstream all_msgs; - - for (auto& w : workers) { - total_sum += w->sum_; - if (w->args_adjoints_.size() != 0) { - shared_adjoints += w->args_adjoints_; - } - all_msgs << w->msgs_.str(); - } - - for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); - } - - if (msgs) { - *msgs << all_msgs.str(); - } - - return var( - new precomputed_gradients_vari(total_sum, - num_vars_sliced_terms + num_vars_shared_terms, - varis, partials)); -} - /* - inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize, + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, std::ostream* msgs, Args&&... args) const { if (vmapped.empty()) { return var(0.0); @@ -395,56 +142,109 @@ inline void operator()(std::size_t begin, std::size_t end) { const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); + vari** varis + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (size_t i = 0; i < num_vars_sliced_terms; ++i) { + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { partials[i] = 0.0; } - recursive_reducer worker(num_vars_per_term, num_vars_shared_terms, partials, - std::forward(vmapped), - std::forward(args)...); - - // we must use task isolation as described here: - // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html - // this is to ensure that the thread local AD tape ressource is - // not being modified from a different task which may happen - // whenever this function is being used itself in a parallel - // context (like running multiple chains for Stan) - tbb::this_task_arena::isolate([&] { - if (auto_partitioning) { - tbb::parallel_reduce( - tbb::blocked_range(0, num_terms, grainsize), worker); - } else { - tbb::simple_partitioner partitioner; - tbb::parallel_deterministic_reduce( - tbb::blocked_range(0, num_terms, grainsize), worker, - partitioner); + // Decide chunk size (grainsize) and workers. 
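    // Illustrative numbers only (not from a benchmark): with 8 pool threads,
    // 10,000 terms and grainsize <= 0, the heuristic below targets
    // 8 * 8 = 64 chunks, i.e. gs = ceil(10000 / 64) = 157 terms per chunk.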
+ auto& pool = stan::math::SimpleThreadPool::instance(); + std::size_t pool_threads = pool.thread_count(); + if (pool_threads == 0) pool_threads = 1; + + std::size_t gs; + if (grainsize > 0) { + gs = static_cast(grainsize); + } else { + // Heuristic: target ~8 chunks per worker. + const std::size_t target_chunks = pool_threads * 8; + gs = (num_terms + target_chunks - 1) / target_chunks; + if (gs < 1) gs = 1; + } + + // Serial cutoffs. + if (pool_threads == 1 || num_terms <= gs) { + recursive_reducer r(num_vars_per_term, num_vars_shared_terms, + partials, vmapped, args...); + r(0, num_terms); + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] + = (r.args_adjoints_.size() != 0) ? r.args_adjoints_.coeff(i) : 0.0; + } + if (msgs) { + *msgs << r.msgs_.str(); + } + + return var(new precomputed_gradients_vari( + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); + } + + const std::size_t num_workers = std::min(pool_threads, num_terms); + + // One reducer per worker (reused across chunks). This lets each worker reuse + // its deep-copied shared arguments and local stack. + std::vector> workers; + workers.reserve(num_workers); + for (std::size_t t = 0; t < num_workers; ++t) { + workers.emplace_back(std::make_unique( + num_vars_per_term, num_vars_shared_terms, partials, vmapped, args...)); + } + + std::atomic next{0}; + + // Dynamic scheduling: each worker pulls the next chunk. + pool.parallel_region(num_workers, [&](std::size_t tid) { + auto& w = *workers[tid]; + while (true) { + const std::size_t begin = + next.fetch_add(gs, std::memory_order_relaxed); + if (begin >= num_terms) break; + const std::size_t end = std::min(begin + gs, num_terms); + w(begin, end); } }); - for (size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = worker.args_adjoints_.coeff(i); + // Aggregate results. 
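    // Each chunk already wrote its sliced-term adjoints into a disjoint
    // region of `partials` (offset begin * num_vars_per_term_), so only the
    // scalar sum, the shared-argument adjoints and any messages still need
    // to be combined across workers here.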
+ double total_sum = 0.0; + Eigen::VectorXd shared_adjoints = + Eigen::VectorXd::Zero(num_vars_shared_terms); + std::stringstream all_msgs; + + for (auto& wptr : workers) { + auto& w = *wptr; + total_sum += w.sum_; + if (w.args_adjoints_.size() != 0) { + shared_adjoints += w.args_adjoints_; + } + all_msgs << w.msgs_.str(); + } + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); } if (msgs) { - *msgs << worker.msgs_.str(); + *msgs << all_msgs.str(); } return var(new precomputed_gradients_vari( - worker.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } - */ }; -} // namespace internal +} // namespace internal } // namespace math } // namespace stan From 5e78da0df51d9b3f9aa620074f941e244cb66bc9 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 4 Dec 2025 11:28:58 -0500 Subject: [PATCH 03/17] 1.2x --- stan/math/rev/core/team_thread_pool.hpp | 191 +++++++++++++++++++ stan/math/rev/functor/reduce_sum.hpp | 237 +++++++++++++----------- 2 files changed, 315 insertions(+), 113 deletions(-) create mode 100644 stan/math/rev/core/team_thread_pool.hpp diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp new file mode 100644 index 00000000000..8aa7eb419c6 --- /dev/null +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -0,0 +1,191 @@ +#ifndef STAN_MATH_REV_CORE_TEAM_THREAD_POOL_HPP +#define STAN_MATH_REV_CORE_TEAM_THREAD_POOL_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { + +/** + * Team (epoch) thread pool for low-overhead parallel regions. + * + * - Creates (hw-1) worker threads once. + * - Caller participates with tid=0. + * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). + * - Nested parallelism: if called from a worker thread, runs serial. + * + * Designed for reduce_sum/map_rect style internal parallelism. + */ +class TeamThreadPool { + public: + static TeamThreadPool& instance() { + static TeamThreadPool pool; + return pool; + } + + TeamThreadPool(const TeamThreadPool&) = delete; + TeamThreadPool& operator=(const TeamThreadPool&) = delete; + + // Number of worker threads (excluding caller) + std::size_t worker_count() const noexcept { return workers_.size(); } + + // Total participants available = worker_count + 1 (caller) + std::size_t team_size() const noexcept { return workers_.size() + 1; } + + template + void parallel_region(std::size_t n, F&& fn) { + if (n == 0) return; + + // If called from a worker, run serial to avoid nested deadlocks. 
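    // (A nested parallel_region would otherwise hand work to workers that
    // are already busy executing the enclosing region and then block waiting
    // on them; running the body serially on the calling worker keeps nested
    // reduce_sum / map_rect calls correct, just not parallel.)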
+ if (in_worker_) { + fn(std::size_t{0}); + return; + } + + const std::size_t max_team = team_size(); + if (max_team == 1) { + fn(std::size_t{0}); + return; + } + if (n > max_team) n = max_team; + if (n == 1) { + fn(std::size_t{0}); + return; + } + + // Stable storage for callable during this region + using Fn = std::decay_t; + Fn fn_copy = std::forward(fn); + + // Publish region + remaining_.store(n - 1, std::memory_order_release); // workers only + region_n_.store(n, std::memory_order_release); + region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); + region_call_.store(&call_impl, std::memory_order_release); + + epoch_.fetch_add(1, std::memory_order_acq_rel); + + // Wake workers + { + std::lock_guard lk(wake_m_); + } + wake_cv_.notify_all(); + + // Caller participates as tid=0 + in_worker_ = true; + fn_copy(0); + in_worker_ = false; + + // Wait for workers 1..n-1 + std::unique_lock lk(done_m_); + done_cv_.wait(lk, [&] { + return remaining_.load(std::memory_order_acquire) == 0; + }); + } + + private: + using call_fn_t = void (*)(void*, std::size_t); + + template + static void call_impl(void* ctx, std::size_t tid) { + (*static_cast(ctx))(tid); + } + + TeamThreadPool() + : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), + region_call_(nullptr), remaining_(0) { + unsigned hw = std::thread::hardware_concurrency(); + if (hw == 0) hw = 2; + + // hw-1 worker threads; caller is +1 participant. + const unsigned num_workers = (hw > 1) ? (hw - 1) : 1; + + workers_.reserve(num_workers); + for (unsigned i = 0; i < num_workers; ++i) { + const std::size_t tid = static_cast(i + 1); // workers are 1..N + workers_.emplace_back([this, tid] { + // Per-worker AD tape initialized once + static thread_local ChainableStack ad_tape; + in_worker_ = true; + + std::size_t seen = epoch_.load(std::memory_order_acquire); + for (;;) { + // Sleep until epoch changes or stop requested + { + std::unique_lock lk(wake_m_); + wake_cv_.wait(lk, [&] { + return stop_.load(std::memory_order_acquire) + || epoch_.load(std::memory_order_acquire) != seen; + }); + } + if (stop_.load(std::memory_order_acquire)) break; + + const std::size_t e = epoch_.load(std::memory_order_acquire); + seen = e; + + const std::size_t n = region_n_.load(std::memory_order_acquire); + if (tid >= n) { + continue; // not participating this region + } + + void* ctx = region_ctx_.load(std::memory_order_acquire); + call_fn_t call = region_call_.load(std::memory_order_acquire); + if (call) { + call(ctx, tid); + } + + if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(done_m_); + done_cv_.notify_one(); + } + } + + in_worker_ = false; + }); + } + } + + ~TeamThreadPool() { + stop_.store(true, std::memory_order_release); + { + std::lock_guard lk(wake_m_); + } + wake_cv_.notify_all(); + for (auto& t : workers_) { + if (t.joinable()) t.join(); + } + } + + static inline thread_local bool in_worker_ = false; + + std::vector workers_; + std::atomic stop_; + + // Region publication + std::atomic epoch_; + std::atomic region_n_; + std::atomic region_ctx_; + std::atomic region_call_; + + // Worker wake + std::mutex wake_m_; + std::condition_variable wake_cv_; + + // Completion + std::atomic remaining_; + std::mutex done_m_; + std::condition_variable done_cv_; +}; + +} // namespace math +} // namespace stan + +#endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 73d868ae097..a41f12334f5 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ 
b/stan/math/rev/functor/reduce_sum.hpp @@ -4,14 +4,14 @@ #include #include #include -#include +#include #include -#include #include #include #include #include +#include #include #include @@ -19,113 +19,127 @@ namespace stan { namespace math { namespace internal { -/** - * Var specialization of reduce_sum_impl - * - * @tparam ReduceFunction Type of reducer function - * @tparam ReturnType Must be var - * @tparam Vec Type of sliced argument - * @tparam Args Types of shared arguments - */ template struct reduce_sum_impl, ReturnType, Vec, Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; - using args_tuple_t - = std::tuple()))...>; + using args_tuple_t = + std::tuple>()))...>; std::unique_ptr args_tuple_holder_; - scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; struct recursive_reducer { + using VecRef = std::decay_t; + using args_ptrs_t = std::tuple*...>; + + // Apply a callable to tuple of pointers, dereferencing each pointer. + template + static inline decltype(auto) apply_ptr_tuple_impl( + Fn&& fn, Tuple& t, std::index_sequence) { + return std::forward(fn)((*std::get(t))...); + } + template + static inline decltype(auto) apply_ptr_tuple(Fn&& fn, + std::tuple& t) { + return apply_ptr_tuple_impl(std::forward(fn), t, + std::index_sequence_for{}); + } + const std::size_t num_vars_per_term_; const std::size_t num_vars_shared_terms_; double* sliced_partials_; - Vec vmapped_; - std::stringstream msgs_; - std::tuple args_tuple_; + const VecRef* vmapped_; + args_ptrs_t args_ptrs_; + + // msgs only if requested + std::unique_ptr msgs_; + std::ostream* msgs_out_; + scoped_args_tuple local_args_tuple_scope_; double sum_{0.0}; - Eigen::VectorXd args_adjoints_{0}; + std::vector args_adjoints_; // faster than Eigen::VectorXd + + // Reusable buffer to avoid realloc per chunk + VecRef local_sub_slice_; + std::size_t reserved_ = 0; - template recursive_reducer(std::size_t num_vars_per_term, std::size_t num_vars_shared_terms, double* sliced_partials, - VecT&& vmapped, - ArgsT&&... args) + const VecRef& vmapped, + std::ostream* msgs, + const std::decay_t&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), - vmapped_(std::forward(vmapped)), - args_tuple_(std::forward(args)...) {} + vmapped_(&vmapped), + args_ptrs_(&args...), + msgs_out_(msgs) { + if (msgs_out_) msgs_ = std::make_unique(); + } inline void operator()(std::size_t begin, std::size_t end) { - if (begin >= end) { - return; - } + if (begin >= end) return; - if (args_adjoints_.size() == 0) { - args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); + if (args_adjoints_.empty()) { + args_adjoints_.assign(num_vars_shared_terms_, 0.0); } - // Obtain reference to a local copy of all shared arguments that do not - // point back to main autodiff stack. if (!local_args_tuple_scope_.args_tuple_holder_) { - // First time: copy shared arguments into reducer-scoped space. local_args_tuple_scope_.stack_.execute([&]() { - math::apply( - [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( - deep_copy_vars(args)...); - }, - args_tuple_); + apply_ptr_tuple([&](auto const&... a) { + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(a)...); + }, args_ptrs_); }); } else { - // Subsequent calls: reset adjoints of shared arguments. 
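        // (Zeroing is needed because the same deep-copied shared arguments
        // are reused for every chunk this reducer processes, and the previous
        // chunk's grad() call left their adjoints set.)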
local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); } auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - // Start a nested autodiff scope for this chunk. const nested_rev_autodiff begin_nest; - // Copy sliced terms into this nested stack. - std::decay_t local_sub_slice; - local_sub_slice.reserve(end - begin); + // Reuse per-worker buffer + const std::size_t n = end - begin; + local_sub_slice_.clear(); + if (reserved_ < n) { + local_sub_slice_.reserve(n); + reserved_ = n; + } + for (std::size_t i = begin; i < end; ++i) { - local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); + local_sub_slice_.emplace_back(deep_copy_vars((*vmapped_)[i])); } - // Run user reducer over [begin, end-1] + std::ostream* local_msgs = + msgs_ ? static_cast(msgs_.get()) : nullptr; + var sub_sum_v = math::apply( - [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, - args...); + [&](auto&&... args_local) { + return ReduceFunction()(local_sub_slice_, begin, end - 1, local_msgs, + args_local...); }, args_tuple_local); - // Compute partial derivatives within this nested stack. sub_sum_v.grad(); - - // Accumulate value. sum_ += sub_sum_v.val(); - // Accumulate adjoints of sliced arguments into the shared partials array. accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, - std::move(local_sub_slice)); + std::move(local_sub_slice_)); + + // local_sub_slice_ got moved-from; restore it to a valid empty state + local_sub_slice_.clear(); - // Accumulate adjoints of shared arguments into this reducer's buffer. math::apply( - [&](auto&&... args) { - accumulate_adjoints(args_adjoints_.data(), args...); + [&](auto&&... args_local) { + accumulate_adjoints(args_adjoints_.data(), args_local...); }, args_tuple_local); } @@ -133,114 +147,111 @@ struct reduce_sum_impl, ReturnType, Ve inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) { - return var(0.0); - } + if (vmapped.empty()) return var(0.0); const std::size_t num_terms = vmapped.size(); const std::size_t num_vars_per_term = count_vars(vmapped[0]); const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis - = ChainableStack::instance_->memalloc_.alloc_array( + vari** varis = + ChainableStack::instance_->memalloc_.alloc_array( num_vars_sliced_terms + num_vars_shared_terms); - double* partials - = ChainableStack::instance_->memalloc_.alloc_array( + double* partials = + ChainableStack::instance_->memalloc_.alloc_array( num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { - partials[i] = 0.0; - } + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) partials[i] = 0.0; + + auto& pool = stan::math::TeamThreadPool::instance(); + const std::size_t max_team = pool.team_size(); - // Decide chunk size (grainsize) and workers. - auto& pool = stan::math::SimpleThreadPool::instance(); - std::size_t pool_threads = pool.thread_count(); - if (pool_threads == 0) pool_threads = 1; + // Choose workers. (Caller participates, so total participants = n) + std::size_t n = std::min(max_team, num_terms == 0 ? 1 : num_terms); + if (n < 1) n = 1; + // Chunking: default to ~2 chunks per participant (lower overhead). 
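    // Illustrative numbers only: with n = 8 participants, 10,000 terms and
    // grainsize <= 0, target_chunks = 16 and gs = ceil(10000 / 16) = 625.
    // Because the partitioning below is static, gs effectively acts as the
    // serial cutoff rather than as a scheduling grain.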
std::size_t gs; if (grainsize > 0) { gs = static_cast(grainsize); + if (gs < 1) gs = 1; } else { - // Heuristic: target ~8 chunks per worker. - const std::size_t target_chunks = pool_threads * 8; + const std::size_t target_chunks = n * 2; gs = (num_terms + target_chunks - 1) / target_chunks; if (gs < 1) gs = 1; } - // Serial cutoffs. - if (pool_threads == 1 || num_terms <= gs) { - recursive_reducer r(num_vars_per_term, num_vars_shared_terms, - partials, vmapped, args...); + // Serial cutoff: if too few terms, don't parallelize. + if (n == 1 || num_terms <= gs) { + recursive_reducer r(num_vars_per_term, num_vars_shared_terms, partials, + vmapped, msgs, args...); r(0, num_terms); - for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] - = (r.args_adjoints_.size() != 0) ? r.args_adjoints_.coeff(i) : 0.0; - } - if (msgs) { - *msgs << r.msgs_.str(); + // write shared adjoints + if (!r.args_adjoints_.empty()) { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = r.args_adjoints_[i]; + } + } else { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = 0.0; + } } + if (msgs && r.msgs_) *msgs << r.msgs_->str(); + return var(new precomputed_gradients_vari( - r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, - partials)); + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } - const std::size_t num_workers = std::min(pool_threads, num_terms); - - // One reducer per worker (reused across chunks). This lets each worker reuse - // its deep-copied shared arguments and local stack. + // One reducer per participant (0..n-1) for static partitioning. + // NOTE: we avoid copying vmapped/args by taking references/pointers inside reducer. std::vector> workers; - workers.reserve(num_workers); - for (std::size_t t = 0; t < num_workers; ++t) { + workers.reserve(n); + for (std::size_t tid = 0; tid < n; ++tid) { workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, vmapped, args...)); + num_vars_per_term, num_vars_shared_terms, partials, + vmapped, msgs, args...)); } - std::atomic next{0}; - - // Dynamic scheduling: each worker pulls the next chunk. - pool.parallel_region(num_workers, [&](std::size_t tid) { - auto& w = *workers[tid]; - while (true) { - const std::size_t begin = - next.fetch_add(gs, std::memory_order_relaxed); - if (begin >= num_terms) break; - const std::size_t end = std::min(begin + gs, num_terms); - w(begin, end); + // Static partition: each participant gets a contiguous block once + pool.parallel_region(n, [&](std::size_t tid) { + const std::size_t b0 = (num_terms * tid) / n; + const std::size_t b1 = (num_terms * (tid + 1)) / n; + if (b0 < b1) { + (*workers[tid])(b0, b1); } }); - // Aggregate results. 
+ // Aggregate double total_sum = 0.0; - Eigen::VectorXd shared_adjoints = - Eigen::VectorXd::Zero(num_vars_shared_terms); + std::vector shared_adj(num_vars_shared_terms, 0.0); std::stringstream all_msgs; for (auto& wptr : workers) { auto& w = *wptr; total_sum += w.sum_; - if (w.args_adjoints_.size() != 0) { - shared_adjoints += w.args_adjoints_; + if (!w.args_adjoints_.empty()) { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + shared_adj[i] += w.args_adjoints_[i]; + } + } + if (msgs && w.msgs_) { + all_msgs << w.msgs_->str(); } - all_msgs << w.msgs_.str(); } for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); + partials[num_vars_sliced_terms + i] = shared_adj[i]; } - if (msgs) { - *msgs << all_msgs.str(); - } + if (msgs) *msgs << all_msgs.str(); return var(new precomputed_gradients_vari( - total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, - partials)); + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } }; From a124301f018efb1850f1e9865b42cc1dfcbb4775 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 8 Dec 2025 11:07:28 -0500 Subject: [PATCH 04/17] almost working version --- stan/math/rev/core.hpp | 1 + stan/math/rev/core/team_thread_pool.hpp | 87 +++++++++++++++++++++++-- stan/math/rev/functor/reduce_sum.hpp | 9 ++- 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp index 9b7646c7d04..0dd91e8d9a6 100644 --- a/stan/math/rev/core.hpp +++ b/stan/math/rev/core.hpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 8aa7eb419c6..8ccbad64b08 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -26,6 +26,16 @@ namespace math { */ class TeamThreadPool { public: + // Call this before first use of TeamThreadPool::instance() + static void set_num_threads(std::size_t n) noexcept { + if (n < 1) n = 1; + user_cap_().store(n, std::memory_order_release); + } + + static std::size_t get_num_threads() noexcept { + return user_cap_().load(std::memory_order_acquire); + } + static TeamThreadPool& instance() { static TeamThreadPool pool; return pool; @@ -42,14 +52,15 @@ class TeamThreadPool { template void parallel_region(std::size_t n, F&& fn) { + //std::cout << "#################### parallel_region, n = " << n << std::endl; if (n == 0) return; // If called from a worker, run serial to avoid nested deadlocks. + //std::cout << "in_worker_ = " << in_worker_ << std::endl; if (in_worker_) { fn(std::size_t{0}); return; } - const std::size_t max_team = team_size(); if (max_team == 1) { fn(std::size_t{0}); @@ -84,14 +95,36 @@ class TeamThreadPool { fn_copy(0); in_worker_ = false; + //std::cout << "waiting for workers" << std::endl; // Wait for workers 1..n-1 std::unique_lock lk(done_m_); done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); + //std::cout << "#################### done" << std::endl << std::endl; + } + +private: + // Function-local static avoids static init order fiasco. 
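  // (set_num_threads() may legitimately be called from another translation
  // unit during static initialization; a function-local static is guaranteed
  // to be constructed on first use, so there is no ordering problem.)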
+ static std::atomic& user_cap_() { + static std::atomic cap{0}; // 0 => "unset" + return cap; + } + + static std::size_t configured_cap_(std::size_t hw) { + // priority: user cap > env var > hw + std::size_t cap = user_cap_().load(std::memory_order_acquire); + if (cap == 0) { + cap = env_num_threads_(); // if you have STAN_NUM_THREADS support + } + if (cap == 0) cap = hw; + + if (cap < 1) cap = 1; + if (cap > hw) cap = hw; // prevent oversubscription by default + return cap; } - private: + using call_fn_t = void (*)(void*, std::size_t); template @@ -99,15 +132,53 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } + static size_t env_num_threads_() { + size_t num_threads = 1; +#ifdef STAN_THREADS + const char* env_stan_num_threads = std::getenv("STAN_NUM_THREADS"); + if (env_stan_num_threads != nullptr) { + try { + const int env_num_threads + = boost::lexical_cast(env_stan_num_threads); + if (env_num_threads > 0) { + num_threads = env_num_threads; + } else if (env_num_threads == -1) { + num_threads = std::thread::hardware_concurrency(); + } else { + invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", + env_stan_num_threads, + "The STAN_NUM_THREADS environment variable is '", + "' but it must be positive or -1"); + } + } catch (const boost::bad_lexical_cast&) { + invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", + env_stan_num_threads, + "The STAN_NUM_THREADS environment variable is '", + "' but it must be a positive number or -1"); + } + } +#endif + return num_threads; +} + + TeamThreadPool() : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), region_call_(nullptr), remaining_(0) { - unsigned hw = std::thread::hardware_concurrency(); - if (hw == 0) hw = 2; - - // hw-1 worker threads; caller is +1 participant. - const unsigned num_workers = (hw > 1) ? (hw - 1) : 1; + unsigned hw_u = std::thread::hardware_concurrency(); + if (hw_u == 0) hw_u = 2; + const std::size_t hw = static_cast(hw_u); + + const std::size_t cap = configured_cap_(hw); + const std::size_t num_workers = (cap > 1) ? 
(cap - 1) : 0; + + std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl + << "hw = " << hw << std::endl + << "num_workers = " << num_workers << std::endl + << "cap = " << cap << std::endl + << std::endl << std::endl; + workers_.reserve(num_workers); for (unsigned i = 0; i < num_workers; ++i) { const std::size_t tid = static_cast(i + 1); // workers are 1..N @@ -151,7 +222,9 @@ class TeamThreadPool { in_worker_ = false; }); } + std::cout << "done with constructor" << std::endl; } + ~TeamThreadPool() { stop_.store(true, std::memory_order_release); diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index a41f12334f5..f3b9dd0136c 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -216,7 +216,14 @@ struct reduce_sum_impl, ReturnType, Ve num_vars_per_term, num_vars_shared_terms, partials, vmapped, msgs, args...)); } - + /* + std::cout << "--------------------------------------------------------------------------------" << std::endl + << "worker count = " << pool.worker_count() << std::endl + << "team size = " << pool.team_size() << std::endl + << "gs = " << gs << std::endl + << std::endl << std::endl; + */ + // Static partition: each participant gets a contiguous block once pool.parallel_region(n, [&](std::size_t tid) { const std::size_t b0 = (num_terms * tid) / n; From 99c269bc059cb30fc1d1423324d62c90a4079dcb Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 8 Dec 2025 11:24:58 -0500 Subject: [PATCH 05/17] better implementation, still hangs --- stan/math/rev/core/team_thread_pool.hpp | 192 +++++++++++++----------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 8ccbad64b08..b4894b96841 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,8 +6,11 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -17,16 +20,16 @@ namespace math { /** * Team (epoch) thread pool for low-overhead parallel regions. * - * - Creates (hw-1) worker threads once. - * - Caller participates with tid=0. + * - Workers are created once. + * - Caller participates as tid=0. * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). * - Nested parallelism: if called from a worker thread, runs serial. - * - * Designed for reduce_sum/map_rect style internal parallelism. + * - set_num_threads(k) must be called before instance() to size the pool. */ class TeamThreadPool { public: - // Call this before first use of TeamThreadPool::instance() + // Call before first instance() to control pool size. + // Meaning: total participants INCLUDING caller (tid=0). 
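  // Hypothetical caller-side sketch (illustration only; do_chunk and the
  // thread count are made up and assume hardware_concurrency() >= 4):
  //
  //   stan::math::TeamThreadPool::set_num_threads(4);   // caller + 3 workers
  //   auto& pool = stan::math::TeamThreadPool::instance();
  //   std::vector<double> partial(4, 0.0);
  //   pool.parallel_region(4, [&](std::size_t tid) {
  //     partial[tid] = do_chunk(tid);  // each tid handles a disjoint block
  //   });
  //   const double total = partial[0] + partial[1] + partial[2] + partial[3];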
static void set_num_threads(std::size_t n) noexcept { if (n < 1) n = 1; user_cap_().store(n, std::memory_order_release); @@ -41,26 +44,25 @@ class TeamThreadPool { return pool; } - TeamThreadPool(const TeamThreadPool&) = delete; - TeamThreadPool& operator=(const TeamThreadPool&) = delete; - - // Number of worker threads (excluding caller) + // Worker threads (excluding caller) std::size_t worker_count() const noexcept { return workers_.size(); } - // Total participants available = worker_count + 1 (caller) + // Total possible participants INCLUDING caller std::size_t team_size() const noexcept { return workers_.size() + 1; } template void parallel_region(std::size_t n, F&& fn) { - //std::cout << "#################### parallel_region, n = " << n << std::endl; if (n == 0) return; - // If called from a worker, run serial to avoid nested deadlocks. - //std::cout << "in_worker_ = " << in_worker_ << std::endl; + // Nested parallelism guard: if already on a worker, run serial. if (in_worker_) { fn(std::size_t{0}); return; } + + // Only one active region at a time (this is required for a single shared epoch design). + std::unique_lock region_lock(region_m_); + const std::size_t max_team = team_size(); if (max_team == 1) { fn(std::size_t{0}); @@ -72,11 +74,17 @@ class TeamThreadPool { return; } - // Stable storage for callable during this region using Fn = std::decay_t; - Fn fn_copy = std::forward(fn); + Fn fn_copy = std::forward(fn); // stable storage during this call + + // Exception propagation: capture first exception from any participant. + std::exception_ptr eptr = nullptr; + { + std::lock_guard lk(exc_m_); + exc_ptr_ = &eptr; + } - // Publish region + // Publish region state BEFORE bumping epoch. remaining_.store(n - 1, std::memory_order_release); // workers only region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); @@ -92,39 +100,28 @@ class TeamThreadPool { // Caller participates as tid=0 in_worker_ = true; - fn_copy(0); + try { + fn_copy(0); + } catch (...) { + std::lock_guard lk(exc_m_); + if (eptr == nullptr) eptr = std::current_exception(); + } in_worker_ = false; - //std::cout << "waiting for workers" << std::endl; // Wait for workers 1..n-1 std::unique_lock lk(done_m_); done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); - //std::cout << "#################### done" << std::endl << std::endl; - } - -private: - // Function-local static avoids static init order fiasco. - static std::atomic& user_cap_() { - static std::atomic cap{0}; // 0 => "unset" - return cap; - } - static std::size_t configured_cap_(std::size_t hw) { - // priority: user cap > env var > hw - std::size_t cap = user_cap_().load(std::memory_order_acquire); - if (cap == 0) { - cap = env_num_threads_(); // if you have STAN_NUM_THREADS support - } - if (cap == 0) cap = hw; + // Clear region participation; not strictly necessary but good hygiene. 
+ region_n_.store(0, std::memory_order_release); - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; // prevent oversubscription by default - return cap; + // Rethrow exception (if any) + if (eptr) std::rethrow_exception(eptr); } - + private: using call_fn_t = void (*)(void*, std::size_t); template @@ -132,64 +129,57 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } - static size_t env_num_threads_() { - size_t num_threads = 1; -#ifdef STAN_THREADS - const char* env_stan_num_threads = std::getenv("STAN_NUM_THREADS"); - if (env_stan_num_threads != nullptr) { - try { - const int env_num_threads - = boost::lexical_cast(env_stan_num_threads); - if (env_num_threads > 0) { - num_threads = env_num_threads; - } else if (env_num_threads == -1) { - num_threads = std::thread::hardware_concurrency(); - } else { - invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", - env_stan_num_threads, - "The STAN_NUM_THREADS environment variable is '", - "' but it must be positive or -1"); - } - } catch (const boost::bad_lexical_cast&) { - invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", - env_stan_num_threads, - "The STAN_NUM_THREADS environment variable is '", - "' but it must be a positive number or -1"); - } - } -#endif - return num_threads; -} + // Function-local static avoids static initialization order issues. + static std::atomic& user_cap_() { + static std::atomic cap{0}; // 0 => unset + return cap; + } + static std::size_t env_num_threads_() noexcept { + const char* s = std::getenv("STAN_NUM_THREADS"); + if (!s || !*s) return 0; + char* end = nullptr; + long v = std::strtol(s, &end, 10); + if (end == s || v <= 0) return 0; + return static_cast(v); + } - TeamThreadPool() - : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), - region_call_(nullptr), remaining_(0) { + static std::size_t configured_cap_(std::size_t hw) noexcept { + // Priority: explicit set_num_threads > STAN_NUM_THREADS > hw + std::size_t cap = user_cap_().load(std::memory_order_acquire); + if (cap == 0) cap = env_num_threads_(); + if (cap == 0) cap = hw; + if (cap < 1) cap = 1; + if (cap > hw) cap = hw; // don’t oversubscribe by default + return cap; + } + + TeamThreadPool() + : stop_(false), + epoch_(0), + region_n_(0), + region_ctx_(nullptr), + region_call_(nullptr), + remaining_(0), + exc_ptr_(nullptr) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; const std::size_t hw = static_cast(hw_u); - + + // Total participants includes caller. const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? 
(cap - 1) : 0; - std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl - << "hw = " << hw << std::endl - << "num_workers = " << num_workers << std::endl - << "cap = " << cap << std::endl - << std::endl << std::endl; - workers_.reserve(num_workers); - for (unsigned i = 0; i < num_workers; ++i) { - const std::size_t tid = static_cast(i + 1); // workers are 1..N + for (std::size_t i = 0; i < num_workers; ++i) { + const std::size_t tid = i + 1; // workers are 1..num_workers workers_.emplace_back([this, tid] { - // Per-worker AD tape initialized once static thread_local ChainableStack ad_tape; in_worker_ = true; std::size_t seen = epoch_.load(std::memory_order_acquire); for (;;) { - // Sleep until epoch changes or stop requested { std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { @@ -203,28 +193,41 @@ class TeamThreadPool { seen = e; const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) { - continue; // not participating this region - } + if (tid >= n) continue; // not participating this region + + // Ensure we ALWAYS decrement remaining_ once for participating workers. + struct DoneGuard { + std::atomic& rem; + std::mutex& m; + std::condition_variable& cv; + bool active{true}; + ~DoneGuard() { + if (!active) return; + if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(m); + cv.notify_one(); + } + } + } guard{remaining_, done_m_, done_cv_}; void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); - if (call) { - call(ctx, tid); - } + if (!call) continue; - if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) { - std::lock_guard lk(done_m_); - done_cv_.notify_one(); + try { + call(ctx, tid); + } catch (...) { + std::lock_guard lk(exc_m_); + if (exc_ptr_ && *exc_ptr_ == nullptr) { + *exc_ptr_ = std::current_exception(); + } } } in_worker_ = false; }); } - std::cout << "done with constructor" << std::endl; } - ~TeamThreadPool() { stop_.store(true, std::memory_order_release); @@ -242,6 +245,9 @@ class TeamThreadPool { std::vector workers_; std::atomic stop_; + // Serialize regions (single shared-region design) + std::mutex region_m_; + // Region publication std::atomic epoch_; std::atomic region_n_; @@ -256,6 +262,10 @@ class TeamThreadPool { std::atomic remaining_; std::mutex done_m_; std::condition_variable done_cv_; + + // Exception plumbing + std::mutex exc_m_; + std::exception_ptr* exc_ptr_; }; } // namespace math From 152610169b937338c80627c74b93b2694e07af7d Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 09:21:53 -0500 Subject: [PATCH 06/17] updating team_thread_pool.hpp --- stan/math/rev/core/team_thread_pool.hpp | 267 +++++++++++++++++++----- 1 file changed, 213 insertions(+), 54 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index b4894b96841..a3726d5e95a 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,30 +6,47 @@ #include #include #include -#include -#include +#include // fprintf, fflush +#include // getenv, strtol +#include // exception_ptr #include #include #include #include #include +// Debug mode: periodic dumps while waiting. +// #define STAN_TEAM_POOL_DEBUG_WAIT 1 + +#if defined(STAN_TEAM_POOL_DEBUG_WAIT) +#include +#endif + namespace stan { namespace math { /** * Team (epoch) thread pool for low-overhead parallel regions. * - * - Workers are created once. 
- * - Caller participates as tid=0. - * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). - * - Nested parallelism: if called from a worker thread, runs serial. - * - set_num_threads(k) must be called before instance() to size the pool. + * Logical tids: + * - caller thread: tid=0 + * - worker threads: tid=1..cap-1 + * + * Key correctness points: + * - Single shared "region" state => serialize parallel_region() with region_m_. + * - Wake generation (wake_gen_) protected by wake_m_ to prevent missed wakeups. + * - Startup barrier ensures all workers are "armed" on wake_gen_ before first use, + * preventing late-start workers from missing the first region. + * + * Debug fields per logical tid: + * - alive[tid] : 1 once the worker thread started (0 means never started / died early) + * - seen[tid] : last epoch observed by that worker (0 means never saw any region) + * - exec[tid] : epoch currently executing (0 means not executing region work) + * - dec[tid] : count of decrements performed by that worker */ class TeamThreadPool { public: - // Call before first instance() to control pool size. - // Meaning: total participants INCLUDING caller (tid=0). + // Total participants INCLUDING caller (tid=0). Call before instance(). static void set_num_threads(std::size_t n) noexcept { if (n < 1) n = 1; user_cap_().store(n, std::memory_order_release); @@ -44,23 +61,58 @@ class TeamThreadPool { return pool; } - // Worker threads (excluding caller) std::size_t worker_count() const noexcept { return workers_.size(); } - - // Total possible participants INCLUDING caller std::size_t team_size() const noexcept { return workers_.size() + 1; } + void dump_state(const char* tag = "TeamThreadPool") const { + const auto epoch = epoch_.load(std::memory_order_acquire); + const auto n = region_n_.load(std::memory_order_acquire); + const auto rem = remaining_.load(std::memory_order_acquire); + + std::fprintf(stderr, + "\n[%s] epoch=%zu region_n=%zu remaining=%zu team_size=%zu wake_gen=%zu ready=%zu/%zu\n", + tag, epoch, n, rem, team_size(), wake_gen_snapshot_(), + ready_count_.load(std::memory_order_acquire), + workers_.size()); + std::fprintf(stderr, " tid | alive | seen | exec | dec | note\n"); + std::fprintf(stderr, " ----+-------+-------+-------+-----+-----------------------------\n"); + + for (std::size_t tid = 1; tid < worker_state_size_; ++tid) { + const unsigned alive = worker_alive_[tid].load(std::memory_order_acquire); + const std::size_t seen = worker_seen_epoch_[tid].load(std::memory_order_acquire); + const std::size_t exec = worker_exec_epoch_[tid].load(std::memory_order_acquire); + const std::size_t dec = worker_decrement_count_[tid].load(std::memory_order_acquire); + + const char* note = ""; + if (!alive) { + note = "NOT ALIVE"; + } else if (tid < n) { + if (seen < epoch) note = "participating but hasn't seen epoch"; + else if (exec == epoch) note = "executing"; + else if (exec != 0) note = "executing (old epoch?)"; + else note = "idle/finished"; + } else { + if (seen == epoch) note = "saw epoch but not participating"; + else note = "not participating"; + } + + std::fprintf(stderr, " %3zu | %u | %5zu | %5zu | %3zu | %s\n", + tid, alive, seen, exec, dec, note); + } + std::fflush(stderr); + } + template void parallel_region(std::size_t n, F&& fn) { if (n == 0) return; - // Nested parallelism guard: if already on a worker, run serial. + // Nested parallelism guard. 
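+    // (Without this guard, a nested launch would block on region_m_ while the
+    // outer region waits for this thread to finish, i.e. a deadlock.)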
if (in_worker_) { fn(std::size_t{0}); return; } - // Only one active region at a time (this is required for a single shared epoch design). + // Single shared region state => serialize launches. std::unique_lock region_lock(region_m_); const std::size_t max_team = team_size(); @@ -75,9 +127,9 @@ class TeamThreadPool { } using Fn = std::decay_t; - Fn fn_copy = std::forward(fn); // stable storage during this call + Fn fn_copy = std::forward(fn); - // Exception propagation: capture first exception from any participant. + // Exception propagation (first exception wins). std::exception_ptr eptr = nullptr; { std::lock_guard lk(exc_m_); @@ -85,20 +137,27 @@ class TeamThreadPool { } // Publish region state BEFORE bumping epoch. - remaining_.store(n - 1, std::memory_order_release); // workers only + remaining_.store(n - 1, std::memory_order_release); region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); region_call_.store(&call_impl, std::memory_order_release); - epoch_.fetch_add(1, std::memory_order_acq_rel); + const std::size_t new_epoch = + epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; - // Wake workers + // std::fprintf(stderr, + // "\n[TeamThreadPool(launch)] epoch=%zu n=%zu expected_workers=%zu team_size=%zu\n", + // new_epoch, n, n - 1, team_size()); + // std::fflush(stderr); + + // Wake workers using wake generation (prevents missed wakeups). { std::lock_guard lk(wake_m_); + ++wake_gen_; } wake_cv_.notify_all(); - // Caller participates as tid=0 + // Caller participates (tid=0). in_worker_ = true; try { fn_copy(0); @@ -108,16 +167,31 @@ class TeamThreadPool { } in_worker_ = false; - // Wait for workers 1..n-1 + // Wait for workers 1..n-1. std::unique_lock lk(done_m_); +#if defined(STAN_TEAM_POOL_DEBUG_WAIT) + auto last_dump = std::chrono::steady_clock::now(); + while (remaining_.load(std::memory_order_acquire) != 0) { + done_cv_.wait_for(lk, std::chrono::milliseconds(250)); + const auto now = std::chrono::steady_clock::now(); + if (now - last_dump > std::chrono::seconds(2) + && remaining_.load(std::memory_order_acquire) != 0) { + std::fprintf(stderr, + "[TeamThreadPool] waiting too long for epoch=%zu (remaining=%zu)\n", + new_epoch, remaining_.load(std::memory_order_acquire)); + dump_state("TeamThreadPool(wait)"); + last_dump = now; + } + } +#else done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); +#endif - // Clear region participation; not strictly necessary but good hygiene. + // Hygiene. region_n_.store(0, std::memory_order_release); - // Rethrow exception (if any) if (eptr) std::rethrow_exception(eptr); } @@ -129,9 +203,8 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } - // Function-local static avoids static initialization order issues. 
static std::atomic& user_cap_() { - static std::atomic cap{0}; // 0 => unset + static std::atomic cap{0}; return cap; } @@ -145,16 +218,19 @@ class TeamThreadPool { } static std::size_t configured_cap_(std::size_t hw) noexcept { - // Priority: explicit set_num_threads > STAN_NUM_THREADS > hw std::size_t cap = user_cap_().load(std::memory_order_acquire); if (cap == 0) cap = env_num_threads_(); if (cap == 0) cap = hw; - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; // don’t oversubscribe by default + if (cap > hw) cap = hw; return cap; } + std::size_t wake_gen_snapshot_() const { + std::lock_guard lk(wake_m_); + return wake_gen_; + } + TeamThreadPool() : stop_(false), epoch_(0), @@ -162,79 +238,149 @@ class TeamThreadPool { region_ctx_(nullptr), region_call_(nullptr), remaining_(0), - exc_ptr_(nullptr) { + exc_ptr_(nullptr), + wake_gen_(0), + ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; const std::size_t hw = static_cast(hw_u); - // Total participants includes caller. const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? (cap - 1) : 0; + worker_state_size_ = cap; + + // raw arrays so atomics aren't moved + worker_alive_.reset(new std::atomic[cap]); + worker_seen_epoch_.reset(new std::atomic[cap]); + worker_exec_epoch_.reset(new std::atomic[cap]); + worker_decrement_count_.reset(new std::atomic[cap]); + + for (std::size_t i = 0; i < cap; ++i) { + worker_alive_[i].store(0u, std::memory_order_relaxed); + worker_seen_epoch_[i].store(0, std::memory_order_relaxed); + worker_exec_epoch_[i].store(0, std::memory_order_relaxed); + worker_decrement_count_[i].store(0, std::memory_order_relaxed); + } + + std::fprintf(stderr, + "[TeamThreadPool(ctor)] cap=%zu (workers=%zu) hw=%zu\n", + cap, num_workers, hw); + std::fflush(stderr); + workers_.reserve(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { - const std::size_t tid = i + 1; // workers are 1..num_workers + const std::size_t tid = i + 1; workers_.emplace_back([this, tid] { static thread_local ChainableStack ad_tape; in_worker_ = true; - std::size_t seen = epoch_.load(std::memory_order_acquire); + if (tid < worker_state_size_) { + worker_alive_[tid].store(1u, std::memory_order_release); + } + + // "Arm" this worker on the current wake generation so it can't miss + // the first region wake. + std::size_t local_gen; + { + std::unique_lock lk(wake_m_); + local_gen = wake_gen_; + } + + // Signal readiness AFTER arming. + ready_count_.fetch_add(1, std::memory_order_acq_rel); + { + std::lock_guard lk(ready_m_); + } + ready_cv_.notify_one(); + for (;;) { + // Wait for wake_gen_ to change (or stop_). { std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { return stop_.load(std::memory_order_acquire) - || epoch_.load(std::memory_order_acquire) != seen; + || wake_gen_ != local_gen; }); + if (stop_.load(std::memory_order_acquire)) break; + local_gen = wake_gen_; } - if (stop_.load(std::memory_order_acquire)) break; + // Observe epoch and region parameters after wake. const std::size_t e = epoch_.load(std::memory_order_acquire); - seen = e; + + if (tid < worker_state_size_) { + worker_seen_epoch_[tid].store(e, std::memory_order_release); + } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) continue; // not participating this region + if (tid >= n) { + continue; + } - // Ensure we ALWAYS decrement remaining_ once for participating workers. + // Always decrement once for participating workers. 
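+          // (A destructor-based guard so the decrement and notify happen
+          // exactly once on every exit path from this iteration, including
+          // `continue` and exceptions.)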
struct DoneGuard { std::atomic& rem; std::mutex& m; std::condition_variable& cv; - bool active{true}; + std::atomic& dec_count; ~DoneGuard() { - if (!active) return; + dec_count.fetch_add(1, std::memory_order_relaxed); if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { std::lock_guard lk(m); cv.notify_one(); } } - } guard{remaining_, done_m_, done_cv_}; + } guard{remaining_, done_m_, done_cv_, worker_decrement_count_[tid]}; + + if (tid < worker_state_size_) { + worker_exec_epoch_[tid].store(e, std::memory_order_release); + } void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); - if (!call) continue; - - try { - call(ctx, tid); - } catch (...) { - std::lock_guard lk(exc_m_); - if (exc_ptr_ && *exc_ptr_ == nullptr) { - *exc_ptr_ = std::current_exception(); + + if (call) { + try { + call(ctx, tid); + } catch (...) { + std::lock_guard lk(exc_m_); + if (exc_ptr_ && *exc_ptr_ == nullptr) { + *exc_ptr_ = std::current_exception(); + } } } + + if (tid < worker_state_size_) { + worker_exec_epoch_[tid].store(0, std::memory_order_release); + } } in_worker_ = false; }); } + + // Startup barrier: ensure all workers are armed and waiting-ready. + { + std::unique_lock lk(ready_m_); + ready_cv_.wait(lk, [&] { + return ready_count_.load(std::memory_order_acquire) == workers_.size(); + }); + } + std::fprintf(stderr, "[TeamThreadPool(ctor)] all workers ready: %zu\n", + workers_.size()); + std::fflush(stderr); } ~TeamThreadPool() { stop_.store(true, std::memory_order_release); { + // bump wake_gen_ so workers wake and see stop_ std::lock_guard lk(wake_m_); + ++wake_gen_; } wake_cv_.notify_all(); + for (auto& t : workers_) { if (t.joinable()) t.join(); } @@ -245,27 +391,40 @@ class TeamThreadPool { std::vector workers_; std::atomic stop_; - // Serialize regions (single shared-region design) + // Serialize regions. std::mutex region_m_; - // Region publication + // Region publication. std::atomic epoch_; std::atomic region_n_; std::atomic region_ctx_; std::atomic region_call_; - // Worker wake - std::mutex wake_m_; + // Wake workers (wake_gen_ is protected by wake_m_). + mutable std::mutex wake_m_; std::condition_variable wake_cv_; + std::size_t wake_gen_; - // Completion + // Startup barrier. + std::mutex ready_m_; + std::condition_variable ready_cv_; + std::atomic ready_count_; + + // Completion. std::atomic remaining_; std::mutex done_m_; std::condition_variable done_cv_; - // Exception plumbing + // Exceptions. std::mutex exc_m_; std::exception_ptr* exc_ptr_; + + // Debug state (arrays of atomics to avoid std::vector> move issues). 
+ std::size_t worker_state_size_{0}; + std::unique_ptr[]> worker_alive_; + std::unique_ptr[]> worker_seen_epoch_; + std::unique_ptr[]> worker_exec_epoch_; + std::unique_ptr[]> worker_decrement_count_; }; } // namespace math From 02589232b4002e8063ef05a0142ebd6b99ee793e Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 20:09:23 -0500 Subject: [PATCH 07/17] updates to team_thread_pool.hpp --- stan/math/rev/core/team_thread_pool.hpp | 212 +++++------------------- 1 file changed, 38 insertions(+), 174 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index a3726d5e95a..538006bde81 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,7 +6,6 @@ #include #include #include -#include // fprintf, fflush #include // getenv, strtol #include // exception_ptr #include @@ -15,34 +14,21 @@ #include #include -// Debug mode: periodic dumps while waiting. -// #define STAN_TEAM_POOL_DEBUG_WAIT 1 - -#if defined(STAN_TEAM_POOL_DEBUG_WAIT) -#include -#endif - namespace stan { namespace math { /** - * Team (epoch) thread pool for low-overhead parallel regions. + * TeamThreadPool * - * Logical tids: - * - caller thread: tid=0 - * - worker threads: tid=1..cap-1 + * - Fixed set of worker threads created once. + * - Caller participates as logical tid=0. + * - Worker threads have stable logical tids 1..(cap-1). + * - parallel_region(n, fn): runs fn(tid) for tid in [0, n), exactly once each. * - * Key correctness points: - * - Single shared "region" state => serialize parallel_region() with region_m_. - * - Wake generation (wake_gen_) protected by wake_m_ to prevent missed wakeups. - * - Startup barrier ensures all workers are "armed" on wake_gen_ before first use, - * preventing late-start workers from missing the first region. - * - * Debug fields per logical tid: - * - alive[tid] : 1 once the worker thread started (0 means never started / died early) - * - seen[tid] : last epoch observed by that worker (0 means never saw any region) - * - exec[tid] : epoch currently executing (0 means not executing region work) - * - dec[tid] : count of decrements performed by that worker + * Notes: + * - Nested parallel_region calls from a worker run serially to avoid deadlock. + * - Uses an epoch counter + condition_variable to wake workers per region. + * - Startup barrier ensures all workers are waiting before the first region launch. 
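+ *
+ * Minimal usage sketch (do_chunk is a hypothetical callable, shown only to
+ * illustrate the tid-based API):
+ *
+ *   auto& pool = stan::math::TeamThreadPool::instance();
+ *   pool.parallel_region(pool.team_size(), [&](std::size_t tid) {
+ *     do_chunk(tid);  // invoked exactly once for each tid in [0, n)
+ *   });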
*/ class TeamThreadPool { public: @@ -64,59 +50,21 @@ class TeamThreadPool { std::size_t worker_count() const noexcept { return workers_.size(); } std::size_t team_size() const noexcept { return workers_.size() + 1; } - void dump_state(const char* tag = "TeamThreadPool") const { - const auto epoch = epoch_.load(std::memory_order_acquire); - const auto n = region_n_.load(std::memory_order_acquire); - const auto rem = remaining_.load(std::memory_order_acquire); - - std::fprintf(stderr, - "\n[%s] epoch=%zu region_n=%zu remaining=%zu team_size=%zu wake_gen=%zu ready=%zu/%zu\n", - tag, epoch, n, rem, team_size(), wake_gen_snapshot_(), - ready_count_.load(std::memory_order_acquire), - workers_.size()); - std::fprintf(stderr, " tid | alive | seen | exec | dec | note\n"); - std::fprintf(stderr, " ----+-------+-------+-------+-----+-----------------------------\n"); - - for (std::size_t tid = 1; tid < worker_state_size_; ++tid) { - const unsigned alive = worker_alive_[tid].load(std::memory_order_acquire); - const std::size_t seen = worker_seen_epoch_[tid].load(std::memory_order_acquire); - const std::size_t exec = worker_exec_epoch_[tid].load(std::memory_order_acquire); - const std::size_t dec = worker_decrement_count_[tid].load(std::memory_order_acquire); - - const char* note = ""; - if (!alive) { - note = "NOT ALIVE"; - } else if (tid < n) { - if (seen < epoch) note = "participating but hasn't seen epoch"; - else if (exec == epoch) note = "executing"; - else if (exec != 0) note = "executing (old epoch?)"; - else note = "idle/finished"; - } else { - if (seen == epoch) note = "saw epoch but not participating"; - else note = "not participating"; - } - - std::fprintf(stderr, " %3zu | %u | %5zu | %5zu | %3zu | %s\n", - tid, alive, seen, exec, dec, note); - } - std::fflush(stderr); - } - template void parallel_region(std::size_t n, F&& fn) { if (n == 0) return; - // Nested parallelism guard. + // Prevent nested parallelism from deadlocking the pool. if (in_worker_) { fn(std::size_t{0}); return; } - // Single shared region state => serialize launches. + // Only one active region at a time (shared region state). std::unique_lock region_lock(region_m_); const std::size_t max_team = team_size(); - if (max_team == 1) { + if (max_team <= 1) { fn(std::size_t{0}); return; } @@ -137,27 +85,23 @@ class TeamThreadPool { } // Publish region state BEFORE bumping epoch. - remaining_.store(n - 1, std::memory_order_release); + remaining_.store(n - 1, std::memory_order_release); // workers only region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); region_call_.store(&call_impl, std::memory_order_release); + // Bump epoch to start the region, then wake workers. const std::size_t new_epoch = epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; - // std::fprintf(stderr, - // "\n[TeamThreadPool(launch)] epoch=%zu n=%zu expected_workers=%zu team_size=%zu\n", - // new_epoch, n, n - 1, team_size()); - // std::fflush(stderr); - - // Wake workers using wake generation (prevents missed wakeups). { std::lock_guard lk(wake_m_); - ++wake_gen_; + // epoch_ already updated; the mutex pairs with the cv wait. + (void)new_epoch; } wake_cv_.notify_all(); - // Caller participates (tid=0). + // Caller participates as tid=0. in_worker_ = true; try { fn_copy(0); @@ -169,25 +113,9 @@ class TeamThreadPool { // Wait for workers 1..n-1. 
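+    // remaining_ was seeded with n - 1: the caller's own share (tid 0) has
+    // already run inline above, so only the workers are counted here.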
std::unique_lock lk(done_m_); -#if defined(STAN_TEAM_POOL_DEBUG_WAIT) - auto last_dump = std::chrono::steady_clock::now(); - while (remaining_.load(std::memory_order_acquire) != 0) { - done_cv_.wait_for(lk, std::chrono::milliseconds(250)); - const auto now = std::chrono::steady_clock::now(); - if (now - last_dump > std::chrono::seconds(2) - && remaining_.load(std::memory_order_acquire) != 0) { - std::fprintf(stderr, - "[TeamThreadPool] waiting too long for epoch=%zu (remaining=%zu)\n", - new_epoch, remaining_.load(std::memory_order_acquire)); - dump_state("TeamThreadPool(wait)"); - last_dump = now; - } - } -#else done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); -#endif // Hygiene. region_n_.store(0, std::memory_order_release); @@ -204,7 +132,7 @@ class TeamThreadPool { } static std::atomic& user_cap_() { - static std::atomic cap{0}; + static std::atomic cap{0}; // 0 => unset return cap; } @@ -226,11 +154,6 @@ class TeamThreadPool { return cap; } - std::size_t wake_gen_snapshot_() const { - std::lock_guard lk(wake_m_); - return wake_gen_; - } - TeamThreadPool() : stop_(false), epoch_(0), @@ -239,7 +162,6 @@ class TeamThreadPool { region_call_(nullptr), remaining_(0), exc_ptr_(nullptr), - wake_gen_(0), ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; @@ -248,94 +170,52 @@ class TeamThreadPool { const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? (cap - 1) : 0; - worker_state_size_ = cap; - - // raw arrays so atomics aren't moved - worker_alive_.reset(new std::atomic[cap]); - worker_seen_epoch_.reset(new std::atomic[cap]); - worker_exec_epoch_.reset(new std::atomic[cap]); - worker_decrement_count_.reset(new std::atomic[cap]); - - for (std::size_t i = 0; i < cap; ++i) { - worker_alive_[i].store(0u, std::memory_order_relaxed); - worker_seen_epoch_[i].store(0, std::memory_order_relaxed); - worker_exec_epoch_[i].store(0, std::memory_order_relaxed); - worker_decrement_count_[i].store(0, std::memory_order_relaxed); - } - - std::fprintf(stderr, - "[TeamThreadPool(ctor)] cap=%zu (workers=%zu) hw=%zu\n", - cap, num_workers, hw); - std::fflush(stderr); - workers_.reserve(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { - const std::size_t tid = i + 1; + const std::size_t tid = i + 1; // workers are 1..num_workers workers_.emplace_back([this, tid] { + // Per-worker AD tape initialized once. static thread_local ChainableStack ad_tape; - in_worker_ = true; - - if (tid < worker_state_size_) { - worker_alive_[tid].store(1u, std::memory_order_release); - } + (void)ad_tape; - // "Arm" this worker on the current wake generation so it can't miss - // the first region wake. - std::size_t local_gen; - { - std::unique_lock lk(wake_m_); - local_gen = wake_gen_; - } + in_worker_ = true; - // Signal readiness AFTER arming. - ready_count_.fetch_add(1, std::memory_order_acq_rel); + // Startup barrier: ensure each worker has entered the wait loop once. { - std::lock_guard lk(ready_m_); + std::lock_guard lk(wake_m_); + ready_count_.fetch_add(1, std::memory_order_acq_rel); } ready_cv_.notify_one(); + std::size_t seen_epoch = epoch_.load(std::memory_order_acquire); + for (;;) { - // Wait for wake_gen_ to change (or stop_). + // Wait for a new epoch (or stop). 
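+          // The predicate re-reads epoch_ while holding wake_m_, and the
+          // launcher bumps epoch_ before briefly taking the same mutex, so a
+          // worker either sees the new epoch here or is already blocked in
+          // wait() when notify_all() fires (no missed wakeups).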
{ std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { return stop_.load(std::memory_order_acquire) - || wake_gen_ != local_gen; + || epoch_.load(std::memory_order_acquire) != seen_epoch; }); if (stop_.load(std::memory_order_acquire)) break; - local_gen = wake_gen_; - } - - // Observe epoch and region parameters after wake. - const std::size_t e = epoch_.load(std::memory_order_acquire); - - if (tid < worker_state_size_) { - worker_seen_epoch_[tid].store(e, std::memory_order_release); + seen_epoch = epoch_.load(std::memory_order_acquire); } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) { - continue; - } + if (tid >= n) continue; // not participating this region // Always decrement once for participating workers. struct DoneGuard { std::atomic& rem; std::mutex& m; std::condition_variable& cv; - std::atomic& dec_count; ~DoneGuard() { - dec_count.fetch_add(1, std::memory_order_relaxed); if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { std::lock_guard lk(m); cv.notify_one(); } } - } guard{remaining_, done_m_, done_cv_, worker_decrement_count_[tid]}; - - if (tid < worker_state_size_) { - worker_exec_epoch_[tid].store(e, std::memory_order_release); - } + } guard{remaining_, done_m_, done_cv_}; void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); @@ -350,34 +230,27 @@ class TeamThreadPool { } } } - - if (tid < worker_state_size_) { - worker_exec_epoch_[tid].store(0, std::memory_order_release); - } } in_worker_ = false; }); } - // Startup barrier: ensure all workers are armed and waiting-ready. + // Wait for all workers to reach the wait loop once before returning. { - std::unique_lock lk(ready_m_); + std::unique_lock lk(wake_m_); ready_cv_.wait(lk, [&] { return ready_count_.load(std::memory_order_acquire) == workers_.size(); }); } - std::fprintf(stderr, "[TeamThreadPool(ctor)] all workers ready: %zu\n", - workers_.size()); - std::fflush(stderr); } ~TeamThreadPool() { stop_.store(true, std::memory_order_release); { - // bump wake_gen_ so workers wake and see stop_ std::lock_guard lk(wake_m_); - ++wake_gen_; + // bump epoch to ensure wake predicate flips + epoch_.fetch_add(1, std::memory_order_acq_rel); } wake_cv_.notify_all(); @@ -400,13 +273,11 @@ class TeamThreadPool { std::atomic region_ctx_; std::atomic region_call_; - // Wake workers (wake_gen_ is protected by wake_m_). - mutable std::mutex wake_m_; + // Wake workers. + std::mutex wake_m_; std::condition_variable wake_cv_; - std::size_t wake_gen_; // Startup barrier. - std::mutex ready_m_; std::condition_variable ready_cv_; std::atomic ready_count_; @@ -418,13 +289,6 @@ class TeamThreadPool { // Exceptions. std::mutex exc_m_; std::exception_ptr* exc_ptr_; - - // Debug state (arrays of atomics to avoid std::vector> move issues). 
- std::size_t worker_state_size_{0}; - std::unique_ptr[]> worker_alive_; - std::unique_ptr[]> worker_seen_epoch_; - std::unique_ptr[]> worker_exec_epoch_; - std::unique_ptr[]> worker_decrement_count_; }; } // namespace math From 1cc87bb4b14e27d51908c8c0a323d62d1348c20e Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 21:12:42 -0500 Subject: [PATCH 08/17] updating make --- make/compiler_flags | 3 --- make/standalone | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/make/compiler_flags b/make/compiler_flags index 0febc5342de..f66d4687506 100644 --- a/make/compiler_flags +++ b/make/compiler_flags @@ -303,9 +303,6 @@ ifneq ($(OS), Windows_NT) LDFLAGS_TBB_RPATH ?= -Wl,-rpath,"$(TBB_LIB)" endif -LDFLAGS_TBB ?= -Wl,-L,"$(TBB_LIB)" $(LDFLAGS_TBB_DTAGS) $(LDFLAGS_TBB_RPATH) - -LDLIBS_TBB ?= -ltbb else diff --git a/make/standalone b/make/standalone index 1dfc9203af1..ca6ffc5b4fd 100644 --- a/make/standalone +++ b/make/standalone @@ -22,7 +22,7 @@ MATH ?= $(realpath $(MATH_MAKE)..)/ # The sundials libraries are only needed for # programs using the stiff ode solver or the # algebra solver -MATH_LIBS ?= $(SUNDIALS_TARGETS) $(MPI_TARGETS) $(TBB_TARGETS) +MATH_LIBS ?= $(SUNDIALS_TARGETS) $(MPI_TARGETS) LDLIBS += $(MATH_LIBS) From 1ada1de9d0bbff236ca3eac263d5df7b04e6cd97 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 12:03:22 -0500 Subject: [PATCH 09/17] map_rect --- stan/math/rev/functor/map_rect_concurrent.hpp | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index 4e32543adaf..b87344f5c7e 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -7,9 +7,10 @@ #include #include #include +#include -#include -#include +//#include +//#include #include #include @@ -34,7 +35,7 @@ map_rect_concurrent( = map_rect_reduce, T_job_param>; using CombineF = map_rect_combine; - const int num_jobs = job_params.size(); + const std::size_t num_jobs = job_params.size(); const vector_d shared_params_dbl = value_of(shared_params); std::vector job_output(num_jobs); std::vector world_f_out(num_jobs, 0); @@ -48,39 +49,25 @@ map_rect_concurrent( }; #ifdef STAN_THREADS - std::cout << "********************************************************************************" << std::endl; - if (num_jobs > 1) { - // simple chunked threading over [0, num_jobs) - unsigned hw_threads = std::thread::hardware_concurrency(); - if (hw_threads == 0) { - hw_threads = 2; // arbitrary but > 0 - } - - const unsigned max_threads - = static_cast(std::min(hw_threads, num_jobs)); - std::cout << "max_threads = " << max_threads << std::endl; - std::vector threads; - threads.reserve(max_threads); - - const std::size_t chunk - = (num_jobs + max_threads - 1) / max_threads; // ceil - - for (unsigned t = 0; t < max_threads; ++t) { - const std::size_t start = t * chunk; - if (start >= num_jobs) break; - const std::size_t end - = std::min(start + chunk, num_jobs); + auto& pool = stan::math::TeamThreadPool::instance(); - threads.emplace_back([&, start, end] { - execute_chunk(start, end); - }); - } + // Total participants includes caller (tid=0). + const std::size_t max_team = pool.team_size(); + const std::size_t n = std::min(max_team, + num_jobs == 0 ? 
1u + : num_jobs); - for (auto& th : threads) { - th.join(); - } - } else { + if (n <= 1 || num_jobs <= 1) { execute_chunk(0, num_jobs); + } else { + pool.parallel_region(n, [&](std::size_t tid) { + const std::size_t nj = num_jobs; + const std::size_t b0 = (nj * tid) / n; + const std::size_t b1 = (nj * (tid + 1)) / n; + if (b0 < b1) { + execute_chunk(b0, b1); + } + }); } #else execute_chunk(0, num_jobs); From 28a32e82d0ceb5add78750ab439fd3f139ac0749 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 14:39:09 -0500 Subject: [PATCH 10/17] adding concurrent_vector --- stan/math/opencl/concurrent_vector.hpp | 226 +++++++++++++++++++++++++ stan/math/opencl/kernel_cl.hpp | 8 +- stan/math/opencl/matrix_cl.hpp | 12 +- stan/math/opencl/opencl_context.hpp | 4 +- 4 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 stan/math/opencl/concurrent_vector.hpp diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp new file mode 100644 index 00000000000..5daef1d8f7c --- /dev/null +++ b/stan/math/opencl/concurrent_vector.hpp @@ -0,0 +1,226 @@ +#ifndef STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP +#define STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { +namespace internal { + + /** + * Minimal segmented concurrent_vector. + * + * Key properties: + * - concurrent emplace_back/push_back using an atomic size counter. + * - segmented storage => no moving elements during growth, stable addresses. + * - segments allocated lazily; allocation uses CAS to avoid locks. + * + * Important constraints / notes: + * - operator[] is safe if you only read indices < size() that are known to be constructed. + * - For "publish-then-read" correctness: size() is updated before construction finishes. + * So consumers must not read index i just because i < size(); they must have a stronger + * protocol (e.g., producer hands out index, or you add a "constructed" bitmap). + * This matches common usage where only the pushing thread uses the returned index. + * - clear()/destruction are NOT concurrent with pushes. + * + * If you need "readers can iterate up to size() safely while writers push", + * add a constructed flag per element (see comment near emplace_back()). + */ + template + class concurrent_vector { + static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); + static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, + "BaseSegmentSize must be a power of two (helps mapping)."); + public: + concurrent_vector() : size_(0) { + segments_.resize(MaxSegments); + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + } + + concurrent_vector(const concurrent_vector&) = delete; + concurrent_vector& operator=(const concurrent_vector&) = delete; + + ~concurrent_vector() { destroy_all_(); } + + std::size_t size() const noexcept { + return size_.load(std::memory_order_acquire); + } + + bool empty() const noexcept { return size() == 0; } + + // Non-concurrent: safe only when no other threads are pushing/reading. + void clear() { + destroy_all_(); + size_.store(0, std::memory_order_release); + } + + // Concurrent append (construct in place). Returns the index. + template + std::size_t emplace_back(Args&&... 
args) { + // Claim an index + const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); + + // Ensure the segment exists + T* seg = ensure_segment_for_index_(idx); + + // Placement-new into the correct slot + const std::size_t off = offset_in_segment_(idx); + T* slot = seg + off; + + // Construct element + ::new (static_cast(slot)) T(std::forward(args)...); + + // If you need "safe iteration by other threads that use size()", + // you must publish construction completion separately, e.g.: + // constructed_[idx].store(true, release); + // and readers check constructed_[i].load(acquire). + return idx; + } + + std::size_t push_back(const T& v) { return emplace_back(v); } + std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } + + // Returns pointer to element at i (no bounds check). + // Safe if element i is fully constructed and lifetime is valid. + T* data_at(std::size_t i) noexcept { + T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + const T* data_at(std::size_t i) const noexcept { + const T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + + // Bounds-checked access (still not concurrent-safe unless you have a protocol). + T& at(std::size_t i) { + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + const T& at(std::size_t i) const { + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + + // Unchecked access + T& operator[](std::size_t i) noexcept { return *data_at(i); } + const T& operator[](std::size_t i) const noexcept { return *data_at(i); } + + // Capacity is segmented and unbounded until MaxSegments is exceeded. + // This is the max number of elements representable by the segment scheme. + static constexpr std::size_t max_size() noexcept { + // Total capacity = Base * (2^MaxSegments - 1) + // but beware overflow for large MaxSegments. + return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + } + + private: + // Segment k has size BaseSegmentSize * 2^k + static constexpr std::size_t segment_size_(std::size_t k) noexcept { + return BaseSegmentSize << k; + } + + // Prefix count before segment k: + // Base * (2^k - 1) + static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { + return BaseSegmentSize * ((std::size_t{1} << k) - 1); + } + + // Map global index -> segment index. + // Let q = idx / Base. Then segment = floor(log2(q + 1)). 
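+      // Worked example with an illustrative BaseSegmentSize of 32:
+      //   idx  0..31  -> q = 0     -> segment 0 (32 slots)
+      //   idx 32..95  -> q = 1..2  -> segment 1 (64 slots)
+      //   idx 96..223 -> q = 3..6  -> segment 2 (128 slots)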
+ static std::size_t segment_index_(std::size_t idx) noexcept { + const std::size_t q = idx / BaseSegmentSize; + const std::size_t x = q + 1; + +#if defined(__GNUG__) || defined(__clang__) + // floor(log2(x)) via clz + return (sizeof(std::size_t) * 8 - 1) - static_cast(__builtin_clzl(x)); +#else + // portable fallback + std::size_t s = 0; + std::size_t t = x; + while (t >>= 1) ++s; + return s; +#endif + } + + static std::size_t offset_in_segment_(std::size_t idx) noexcept { + const std::size_t s = segment_index_(idx); + return idx - segment_prefix_(s); + } + + T* segment_ptr_(std::size_t s) noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + const T* segment_ptr_(std::size_t s) const noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + + T* seg = segment_ptr_(s); + if (seg) return seg; + + // Allocate segment lazily (raw storage for T objects) + const std::size_t n = segment_size_(s); + void* raw = ::operator new(sizeof(T) * n); + T* fresh = static_cast(raw); + + // CAS install; if another thread won, free ours. + void* expected = nullptr; + if (!segments_[s].compare_exchange_strong( + expected, fresh, + std::memory_order_release, + std::memory_order_acquire)) { + ::operator delete(raw); + seg = static_cast(segments_[s].load(std::memory_order_acquire)); + assert(seg != nullptr); + return seg; + } + + return fresh; + } + + // Destroy constructed elements and free segments. + // Not concurrent with pushes or reads. + void destroy_all_() noexcept { + const std::size_t n = size_.load(std::memory_order_acquire); + + // Destroy elements that were constructed. + // NOTE: This assumes indices [0, n) are all constructed. + // If you allow exceptions or partial construction, track constructed flags. + for (std::size_t i = 0; i < n; ++i) { + data_at(i)->~T(); + } + + // Free segments + for (std::size_t s = 0; s < segments_.size(); ++s) { + void* p = segments_[s].load(std::memory_order_acquire); + if (p) { + ::operator delete(p); + segments_[s].store(nullptr, std::memory_order_relaxed); + } + } + } + + std::atomic size_; + std::vector> segments_; + }; +} +} +} + +#endif diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index 8f0b6a66e7d..8905243d709 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. 
*/ template * = nullptr> -inline tbb::concurrent_vector select_events(const T& m) { - return tbb::concurrent_vector{}; +inline internal::concurrent_vector select_events(const T& m) { + return internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const tbb::concurrent_vector& select_events(const K& m) { +inline const internal::concurrent_vector& select_events(const K& m) { return m.write_events(); } template * = nullptr, require_any_same_t* = nullptr> -inline tbb::concurrent_vector select_events(K& m) { +inline internal::concurrent_vector select_events(K& m) { static_assert(!std::is_const::value, "Can not write to const matrix_cl!"); return m.read_write_events(); } diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 4114c1199c8..476a688ce7d 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -51,8 +51,8 @@ class matrix_cl : public matrix_cl_base { int cols_{0}; // Number of columns. // Holds info on if matrix is a special type matrix_cl_view view_{matrix_cl_view::Entire}; - mutable tbb::concurrent_vector write_events_; // Tracks write jobs - mutable tbb::concurrent_vector read_events_; // Tracks reads + mutable internal::concurrent_vector write_events_; // Tracks write jobs + mutable internal::concurrent_vector read_events_; // Tracks reads public: using Scalar = T; // Underlying type of the matrix @@ -100,7 +100,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The write event stack. */ - inline const tbb::concurrent_vector& write_events() const { + inline const internal::concurrent_vector& write_events() const { return write_events_; } @@ -108,7 +108,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. */ - inline const tbb::concurrent_vector& read_events() const { + inline const internal::concurrent_vector& read_events() const { return read_events_; } @@ -116,7 +116,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. 
*/ - inline const tbb::concurrent_vector read_write_events() const { + inline const internal::concurrent_vector read_write_events() const { return vec_concat(this->read_events(), this->write_events()); } diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index e2373df126d..b0c435e33e0 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include @@ -208,7 +208,7 @@ class opencl_context_base { * The API to access the methods and values in opencl_context_base */ class opencl_context { - tbb::concurrent_vector kernel_caches_; + internal::concurrent_vector kernel_caches_; public: opencl_context() = default; From ac8a0de9d1bf78854ff4fe628bd63ef952c5b618 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 14:42:46 -0500 Subject: [PATCH 11/17] more changes --- stan/math/prim/functor/reduce_sum_static.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/stan/math/prim/functor/reduce_sum_static.hpp b/stan/math/prim/functor/reduce_sum_static.hpp index 7ad8648aad8..29f70f193d3 100644 --- a/stan/math/prim/functor/reduce_sum_static.hpp +++ b/stan/math/prim/functor/reduce_sum_static.hpp @@ -4,9 +4,6 @@ #include #include #include -#include -#include -#include #include #include From cc68a9f73fc339072accf68458492e62c0306115 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 15:12:21 -0500 Subject: [PATCH 12/17] fixed header --- stan/math/opencl/matrix_cl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 476a688ce7d..4ebc84a1b0a 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include From 593440ffe2a51257cc6c263a9729256ab0e17ea0 Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Tue, 23 Dec 2025 15:17:06 -0500 Subject: [PATCH 13/17] [Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1 --- stan/math/opencl/concurrent_vector.hpp | 408 +++++++++--------- stan/math/opencl/matrix_cl.hpp | 8 +- stan/math/rev/core/simple_thread_pool.hpp | 19 +- stan/math/rev/core/team_thread_pool.hpp | 57 ++- stan/math/rev/functor/map_rect_concurrent.hpp | 5 +- stan/math/rev/functor/reduce_sum.hpp | 104 +++-- 6 files changed, 322 insertions(+), 279 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 5daef1d8f7c..46b3e448f19 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -15,212 +15,220 @@ namespace stan { namespace math { namespace internal { - /** - * Minimal segmented concurrent_vector. - * - * Key properties: - * - concurrent emplace_back/push_back using an atomic size counter. - * - segmented storage => no moving elements during growth, stable addresses. - * - segments allocated lazily; allocation uses CAS to avoid locks. - * - * Important constraints / notes: - * - operator[] is safe if you only read indices < size() that are known to be constructed. - * - For "publish-then-read" correctness: size() is updated before construction finishes. - * So consumers must not read index i just because i < size(); they must have a stronger - * protocol (e.g., producer hands out index, or you add a "constructed" bitmap). - * This matches common usage where only the pushing thread uses the returned index. 
- * - clear()/destruction are NOT concurrent with pushes. - * - * If you need "readers can iterate up to size() safely while writers push", - * add a constructed flag per element (see comment near emplace_back()). - */ - template - class concurrent_vector { - static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); - static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, - "BaseSegmentSize must be a power of two (helps mapping)."); - public: - concurrent_vector() : size_(0) { - segments_.resize(MaxSegments); - for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); - } - - concurrent_vector(const concurrent_vector&) = delete; - concurrent_vector& operator=(const concurrent_vector&) = delete; - - ~concurrent_vector() { destroy_all_(); } - - std::size_t size() const noexcept { - return size_.load(std::memory_order_acquire); - } - - bool empty() const noexcept { return size() == 0; } - - // Non-concurrent: safe only when no other threads are pushing/reading. - void clear() { - destroy_all_(); - size_.store(0, std::memory_order_release); - } - - // Concurrent append (construct in place). Returns the index. - template - std::size_t emplace_back(Args&&... args) { - // Claim an index - const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); - - // Ensure the segment exists - T* seg = ensure_segment_for_index_(idx); - - // Placement-new into the correct slot - const std::size_t off = offset_in_segment_(idx); - T* slot = seg + off; - - // Construct element - ::new (static_cast(slot)) T(std::forward(args)...); - - // If you need "safe iteration by other threads that use size()", - // you must publish construction completion separately, e.g.: - // constructed_[idx].store(true, release); - // and readers check constructed_[i].load(acquire). - return idx; - } - - std::size_t push_back(const T& v) { return emplace_back(v); } - std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } - - // Returns pointer to element at i (no bounds check). - // Safe if element i is fully constructed and lifetime is valid. - T* data_at(std::size_t i) noexcept { - T* seg = segment_ptr_(segment_index_(i)); - return seg + offset_in_segment_(i); - } - const T* data_at(std::size_t i) const noexcept { - const T* seg = segment_ptr_(segment_index_(i)); - return seg + offset_in_segment_(i); - } - - // Bounds-checked access (still not concurrent-safe unless you have a protocol). - T& at(std::size_t i) { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); - return *data_at(i); - } - const T& at(std::size_t i) const { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); - return *data_at(i); - } - - // Unchecked access - T& operator[](std::size_t i) noexcept { return *data_at(i); } - const T& operator[](std::size_t i) const noexcept { return *data_at(i); } - - // Capacity is segmented and unbounded until MaxSegments is exceeded. - // This is the max number of elements representable by the segment scheme. - static constexpr std::size_t max_size() noexcept { - // Total capacity = Base * (2^MaxSegments - 1) - // but beware overflow for large MaxSegments. 
- return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); - } - - private: - // Segment k has size BaseSegmentSize * 2^k - static constexpr std::size_t segment_size_(std::size_t k) noexcept { - return BaseSegmentSize << k; - } - - // Prefix count before segment k: - // Base * (2^k - 1) - static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { - return BaseSegmentSize * ((std::size_t{1} << k) - 1); - } - - // Map global index -> segment index. - // Let q = idx / Base. Then segment = floor(log2(q + 1)). - static std::size_t segment_index_(std::size_t idx) noexcept { - const std::size_t q = idx / BaseSegmentSize; - const std::size_t x = q + 1; +/** + * Minimal segmented concurrent_vector. + * + * Key properties: + * - concurrent emplace_back/push_back using an atomic size counter. + * - segmented storage => no moving elements during growth, stable addresses. + * - segments allocated lazily; allocation uses CAS to avoid locks. + * + * Important constraints / notes: + * - operator[] is safe if you only read indices < size() that are known to be + * constructed. + * - For "publish-then-read" correctness: size() is updated before construction + * finishes. So consumers must not read index i just because i < size(); they + * must have a stronger protocol (e.g., producer hands out index, or you add a + * "constructed" bitmap). This matches common usage where only the pushing + * thread uses the returned index. + * - clear()/destruction are NOT concurrent with pushes. + * + * If you need "readers can iterate up to size() safely while writers push", + * add a constructed flag per element (see comment near emplace_back()). + */ +template +class concurrent_vector { + static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); + static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, + "BaseSegmentSize must be a power of two (helps mapping)."); + + public: + concurrent_vector() : size_(0) { + segments_.resize(MaxSegments); + for (auto& p : segments_) + p.store(nullptr, std::memory_order_relaxed); + } + + concurrent_vector(const concurrent_vector&) = delete; + concurrent_vector& operator=(const concurrent_vector&) = delete; + + ~concurrent_vector() { destroy_all_(); } + + std::size_t size() const noexcept { + return size_.load(std::memory_order_acquire); + } + + bool empty() const noexcept { return size() == 0; } + + // Non-concurrent: safe only when no other threads are pushing/reading. + void clear() { + destroy_all_(); + size_.store(0, std::memory_order_release); + } + + // Concurrent append (construct in place). Returns the index. + template + std::size_t emplace_back(Args&&... args) { + // Claim an index + const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); + + // Ensure the segment exists + T* seg = ensure_segment_for_index_(idx); + + // Placement-new into the correct slot + const std::size_t off = offset_in_segment_(idx); + T* slot = seg + off; + + // Construct element + ::new (static_cast(slot)) T(std::forward(args)...); + + // If you need "safe iteration by other threads that use size()", + // you must publish construction completion separately, e.g.: + // constructed_[idx].store(true, release); + // and readers check constructed_[i].load(acquire). + return idx; + } + + std::size_t push_back(const T& v) { return emplace_back(v); } + std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } + + // Returns pointer to element at i (no bounds check). + // Safe if element i is fully constructed and lifetime is valid. 
+ T* data_at(std::size_t i) noexcept { + T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + const T* data_at(std::size_t i) const noexcept { + const T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + + // Bounds-checked access (still not concurrent-safe unless you have a + // protocol). + T& at(std::size_t i) { + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + const T& at(std::size_t i) const { + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + + // Unchecked access + T& operator[](std::size_t i) noexcept { return *data_at(i); } + const T& operator[](std::size_t i) const noexcept { return *data_at(i); } + + // Capacity is segmented and unbounded until MaxSegments is exceeded. + // This is the max number of elements representable by the segment scheme. + static constexpr std::size_t max_size() noexcept { + // Total capacity = Base * (2^MaxSegments - 1) + // but beware overflow for large MaxSegments. + return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + } + + private: + // Segment k has size BaseSegmentSize * 2^k + static constexpr std::size_t segment_size_(std::size_t k) noexcept { + return BaseSegmentSize << k; + } + + // Prefix count before segment k: + // Base * (2^k - 1) + static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { + return BaseSegmentSize * ((std::size_t{1} << k) - 1); + } + + // Map global index -> segment index. + // Let q = idx / Base. Then segment = floor(log2(q + 1)). + static std::size_t segment_index_(std::size_t idx) noexcept { + const std::size_t q = idx / BaseSegmentSize; + const std::size_t x = q + 1; #if defined(__GNUG__) || defined(__clang__) - // floor(log2(x)) via clz - return (sizeof(std::size_t) * 8 - 1) - static_cast(__builtin_clzl(x)); + // floor(log2(x)) via clz + return (sizeof(std::size_t) * 8 - 1) + - static_cast(__builtin_clzl(x)); #else - // portable fallback - std::size_t s = 0; - std::size_t t = x; - while (t >>= 1) ++s; - return s; + // portable fallback + std::size_t s = 0; + std::size_t t = x; + while (t >>= 1) + ++s; + return s; #endif - } - - static std::size_t offset_in_segment_(std::size_t idx) noexcept { - const std::size_t s = segment_index_(idx); - return idx - segment_prefix_(s); - } - - T* segment_ptr_(std::size_t s) noexcept { - return static_cast(segments_[s].load(std::memory_order_acquire)); - } - const T* segment_ptr_(std::size_t s) const noexcept { - return static_cast(segments_[s].load(std::memory_order_acquire)); - } - - T* ensure_segment_for_index_(std::size_t idx) { - const std::size_t s = segment_index_(idx); - if (s >= MaxSegments) { - throw std::length_error("concurrent_vector: exceeded MaxSegments"); - } - - T* seg = segment_ptr_(s); - if (seg) return seg; - - // Allocate segment lazily (raw storage for T objects) - const std::size_t n = segment_size_(s); - void* raw = ::operator new(sizeof(T) * n); - T* fresh = static_cast(raw); - - // CAS install; if another thread won, free ours. - void* expected = nullptr; - if (!segments_[s].compare_exchange_strong( - expected, fresh, - std::memory_order_release, - std::memory_order_acquire)) { - ::operator delete(raw); - seg = static_cast(segments_[s].load(std::memory_order_acquire)); - assert(seg != nullptr); - return seg; - } - - return fresh; - } - - // Destroy constructed elements and free segments. - // Not concurrent with pushes or reads. 
- void destroy_all_() noexcept { - const std::size_t n = size_.load(std::memory_order_acquire); - - // Destroy elements that were constructed. - // NOTE: This assumes indices [0, n) are all constructed. - // If you allow exceptions or partial construction, track constructed flags. - for (std::size_t i = 0; i < n; ++i) { - data_at(i)->~T(); - } - - // Free segments - for (std::size_t s = 0; s < segments_.size(); ++s) { - void* p = segments_[s].load(std::memory_order_acquire); - if (p) { - ::operator delete(p); - segments_[s].store(nullptr, std::memory_order_relaxed); - } + } + + static std::size_t offset_in_segment_(std::size_t idx) noexcept { + const std::size_t s = segment_index_(idx); + return idx - segment_prefix_(s); + } + + T* segment_ptr_(std::size_t s) noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + const T* segment_ptr_(std::size_t s) const noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + + T* seg = segment_ptr_(s); + if (seg) + return seg; + + // Allocate segment lazily (raw storage for T objects) + const std::size_t n = segment_size_(s); + void* raw = ::operator new(sizeof(T) * n); + T* fresh = static_cast(raw); + + // CAS install; if another thread won, free ours. + void* expected = nullptr; + if (!segments_[s].compare_exchange_strong(expected, fresh, + std::memory_order_release, + std::memory_order_acquire)) { + ::operator delete(raw); + seg = static_cast(segments_[s].load(std::memory_order_acquire)); + assert(seg != nullptr); + return seg; + } + + return fresh; + } + + // Destroy constructed elements and free segments. + // Not concurrent with pushes or reads. + void destroy_all_() noexcept { + const std::size_t n = size_.load(std::memory_order_acquire); + + // Destroy elements that were constructed. + // NOTE: This assumes indices [0, n) are all constructed. + // If you allow exceptions or partial construction, track constructed flags. + for (std::size_t i = 0; i < n; ++i) { + data_at(i)->~T(); + } + + // Free segments + for (std::size_t s = 0; s < segments_.size(); ++s) { + void* p = segments_[s].load(std::memory_order_acquire); + if (p) { + ::operator delete(p); + segments_[s].store(nullptr, std::memory_order_relaxed); } } + } - std::atomic size_; - std::vector> segments_; - }; -} -} -} + std::atomic size_; + std::vector> segments_; +}; +} // namespace internal +} // namespace math +} // namespace stan #endif diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 4ebc84a1b0a..3d3a4c6e436 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -51,8 +51,9 @@ class matrix_cl : public matrix_cl_base { int cols_{0}; // Number of columns. // Holds info on if matrix is a special type matrix_cl_view view_{matrix_cl_view::Entire}; - mutable internal::concurrent_vector write_events_; // Tracks write jobs - mutable internal::concurrent_vector read_events_; // Tracks reads + mutable internal::concurrent_vector + write_events_; // Tracks write jobs + mutable internal::concurrent_vector read_events_; // Tracks reads public: using Scalar = T; // Underlying type of the matrix @@ -116,7 +117,8 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. 
*/ - inline const internal::concurrent_vector read_write_events() const { + inline const internal::concurrent_vector read_write_events() + const { return vec_concat(this->read_events(), this->write_events()); } diff --git a/stan/math/rev/core/simple_thread_pool.hpp b/stan/math/rev/core/simple_thread_pool.hpp index 84e1adf7c55..16749ee1e5b 100644 --- a/stan/math/rev/core/simple_thread_pool.hpp +++ b/stan/math/rev/core/simple_thread_pool.hpp @@ -19,7 +19,7 @@ namespace stan { namespace math { class SimpleThreadPool { -public: + public: static SimpleThreadPool& instance() { static SimpleThreadPool pool; return pool; @@ -44,7 +44,8 @@ class SimpleThreadPool { template void parallel_region(std::size_t n, F&& fn) { - if (n == 0) return; + if (n == 0) + return; // Avoid nested parallelism deadlocks/oversubscription. if (in_worker_) { @@ -57,7 +58,8 @@ class SimpleThreadPool { fn(std::size_t{0}); return; } - if (n > tc) n = tc; + if (n > tc) + n = tc; using Fn = std::decay_t; struct Shared { @@ -86,10 +88,11 @@ class SimpleThreadPool { }); } -private: + private: SimpleThreadPool() : done_(false) { unsigned hw = std::thread::hardware_concurrency(); - if (hw == 0) hw = 2; + if (hw == 0) + hw = 2; const unsigned num_threads = hw; workers_.reserve(num_threads); @@ -103,7 +106,8 @@ class SimpleThreadPool { { std::unique_lock lock(mtx_); cv_.wait(lock, [&] { return done_ || !tasks_.empty(); }); - if (done_ && tasks_.empty()) return; + if (done_ && tasks_.empty()) + return; task = std::move(tasks_.front()); tasks_.pop(); } @@ -122,7 +126,8 @@ class SimpleThreadPool { } cv_.notify_all(); for (auto& th : workers_) { - if (th.joinable()) th.join(); + if (th.joinable()) + th.join(); } } diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 538006bde81..2a77ff644b9 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -28,13 +28,15 @@ namespace math { * Notes: * - Nested parallel_region calls from a worker run serially to avoid deadlock. * - Uses an epoch counter + condition_variable to wake workers per region. - * - Startup barrier ensures all workers are waiting before the first region launch. + * - Startup barrier ensures all workers are waiting before the first region + * launch. */ class TeamThreadPool { public: // Total participants INCLUDING caller (tid=0). Call before instance(). static void set_num_threads(std::size_t n) noexcept { - if (n < 1) n = 1; + if (n < 1) + n = 1; user_cap_().store(n, std::memory_order_release); } @@ -52,7 +54,8 @@ class TeamThreadPool { template void parallel_region(std::size_t n, F&& fn) { - if (n == 0) return; + if (n == 0) + return; // Prevent nested parallelism from deadlocking the pool. if (in_worker_) { @@ -68,7 +71,8 @@ class TeamThreadPool { fn(std::size_t{0}); return; } - if (n > max_team) n = max_team; + if (n > max_team) + n = max_team; if (n == 1) { fn(std::size_t{0}); return; @@ -91,8 +95,8 @@ class TeamThreadPool { region_call_.store(&call_impl, std::memory_order_release); // Bump epoch to start the region, then wake workers. - const std::size_t new_epoch = - epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; + const std::size_t new_epoch + = epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; { std::lock_guard lk(wake_m_); @@ -107,20 +111,21 @@ class TeamThreadPool { fn_copy(0); } catch (...) 
{ std::lock_guard lk(exc_m_); - if (eptr == nullptr) eptr = std::current_exception(); + if (eptr == nullptr) + eptr = std::current_exception(); } in_worker_ = false; // Wait for workers 1..n-1. std::unique_lock lk(done_m_); - done_cv_.wait(lk, [&] { - return remaining_.load(std::memory_order_acquire) == 0; - }); + done_cv_.wait( + lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); // Hygiene. region_n_.store(0, std::memory_order_release); - if (eptr) std::rethrow_exception(eptr); + if (eptr) + std::rethrow_exception(eptr); } private: @@ -138,19 +143,25 @@ class TeamThreadPool { static std::size_t env_num_threads_() noexcept { const char* s = std::getenv("STAN_NUM_THREADS"); - if (!s || !*s) return 0; + if (!s || !*s) + return 0; char* end = nullptr; long v = std::strtol(s, &end, 10); - if (end == s || v <= 0) return 0; + if (end == s || v <= 0) + return 0; return static_cast(v); } static std::size_t configured_cap_(std::size_t hw) noexcept { std::size_t cap = user_cap_().load(std::memory_order_acquire); - if (cap == 0) cap = env_num_threads_(); - if (cap == 0) cap = hw; - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; + if (cap == 0) + cap = env_num_threads_(); + if (cap == 0) + cap = hw; + if (cap < 1) + cap = 1; + if (cap > hw) + cap = hw; return cap; } @@ -164,7 +175,8 @@ class TeamThreadPool { exc_ptr_(nullptr), ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); - if (hw_u == 0) hw_u = 2; + if (hw_u == 0) + hw_u = 2; const std::size_t hw = static_cast(hw_u); const std::size_t cap = configured_cap_(hw); @@ -197,12 +209,14 @@ class TeamThreadPool { return stop_.load(std::memory_order_acquire) || epoch_.load(std::memory_order_acquire) != seen_epoch; }); - if (stop_.load(std::memory_order_acquire)) break; + if (stop_.load(std::memory_order_acquire)) + break; seen_epoch = epoch_.load(std::memory_order_acquire); } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) continue; // not participating this region + if (tid >= n) + continue; // not participating this region // Always decrement once for participating workers. struct DoneGuard { @@ -255,7 +269,8 @@ class TeamThreadPool { wake_cv_.notify_all(); for (auto& t : workers_) { - if (t.joinable()) t.join(); + if (t.joinable()) + t.join(); } } diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index b87344f5c7e..76d38866885 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -53,9 +53,8 @@ map_rect_concurrent( // Total participants includes caller (tid=0). const std::size_t max_team = pool.team_size(); - const std::size_t n = std::min(max_team, - num_jobs == 0 ? 1u - : num_jobs); + const std::size_t n + = std::min(max_team, num_jobs == 0 ? 
1u : num_jobs); if (n <= 1 || num_jobs <= 1) { execute_chunk(0, num_jobs); diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index f3b9dd0136c..814bc6e3f59 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -21,12 +21,12 @@ namespace internal { template -struct reduce_sum_impl, ReturnType, Vec, - Args...> { +struct reduce_sum_impl, ReturnType, + Vec, Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; - using args_tuple_t = - std::tuple>()))...>; + using args_tuple_t = std::tuple>()))...>; std::unique_ptr args_tuple_holder_; scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; @@ -70,21 +70,21 @@ struct reduce_sum_impl, ReturnType, Ve recursive_reducer(std::size_t num_vars_per_term, std::size_t num_vars_shared_terms, - double* sliced_partials, - const VecRef& vmapped, - std::ostream* msgs, - const std::decay_t&... args) + double* sliced_partials, const VecRef& vmapped, + std::ostream* msgs, const std::decay_t&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), vmapped_(&vmapped), args_ptrs_(&args...), msgs_out_(msgs) { - if (msgs_out_) msgs_ = std::make_unique(); + if (msgs_out_) + msgs_ = std::make_unique(); } inline void operator()(std::size_t begin, std::size_t end) { - if (begin >= end) return; + if (begin >= end) + return; if (args_adjoints_.empty()) { args_adjoints_.assign(num_vars_shared_terms_, 0.0); @@ -92,11 +92,13 @@ struct reduce_sum_impl, ReturnType, Ve if (!local_args_tuple_scope_.args_tuple_holder_) { local_args_tuple_scope_.stack_.execute([&]() { - apply_ptr_tuple([&](auto const&... a) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( + apply_ptr_tuple( + [&](auto const&... a) { + local_args_tuple_scope_.args_tuple_holder_ = std::make_unique< + typename scoped_args_tuple::args_tuple_t>( deep_copy_vars(a)...); - }, args_ptrs_); + }, + args_ptrs_); }); } else { local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); @@ -118,13 +120,13 @@ struct reduce_sum_impl, ReturnType, Ve local_sub_slice_.emplace_back(deep_copy_vars((*vmapped_)[i])); } - std::ostream* local_msgs = - msgs_ ? static_cast(msgs_.get()) : nullptr; + std::ostream* local_msgs + = msgs_ ? static_cast(msgs_.get()) : nullptr; var sub_sum_v = math::apply( [&](auto&&... args_local) { - return ReduceFunction()(local_sub_slice_, begin, end - 1, local_msgs, - args_local...); + return ReduceFunction()(local_sub_slice_, begin, end - 1, + local_msgs, args_local...); }, args_tuple_local); @@ -145,43 +147,48 @@ struct reduce_sum_impl, ReturnType, Ve } }; - inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, - std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) return var(0.0); + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, + int grainsize, std::ostream* msgs, + Args&&... 
args) const { + if (vmapped.empty()) + return var(0.0); const std::size_t num_terms = vmapped.size(); const std::size_t num_vars_per_term = count_vars(vmapped[0]); const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis = - ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials = - ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); + vari** varis = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) partials[i] = 0.0; + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) + partials[i] = 0.0; auto& pool = stan::math::TeamThreadPool::instance(); const std::size_t max_team = pool.team_size(); // Choose workers. (Caller participates, so total participants = n) - std::size_t n = std::min(max_team, num_terms == 0 ? 1 : num_terms); - if (n < 1) n = 1; + std::size_t n + = std::min(max_team, num_terms == 0 ? 1 : num_terms); + if (n < 1) + n = 1; // Chunking: default to ~2 chunks per participant (lower overhead). std::size_t gs; if (grainsize > 0) { gs = static_cast(grainsize); - if (gs < 1) gs = 1; + if (gs < 1) + gs = 1; } else { const std::size_t target_chunks = n * 2; gs = (num_terms + target_chunks - 1) / target_chunks; - if (gs < 1) gs = 1; + if (gs < 1) + gs = 1; } // Serial cutoff: if too few terms, don't parallelize. @@ -201,29 +208,34 @@ struct reduce_sum_impl, ReturnType, Ve } } - if (msgs && r.msgs_) *msgs << r.msgs_->str(); + if (msgs && r.msgs_) + *msgs << r.msgs_->str(); return var(new precomputed_gradients_vari( - r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); } // One reducer per participant (0..n-1) for static partitioning. - // NOTE: we avoid copying vmapped/args by taking references/pointers inside reducer. + // NOTE: we avoid copying vmapped/args by taking references/pointers inside + // reducer. 
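+    // For illustration, with num_terms = 10 and n = 3 the block starts below
+    // are (10 * 0) / 3 = 0, (10 * 1) / 3 = 3 and (10 * 2) / 3 = 6, so the
+    // participants own the contiguous slices [0, 3), [3, 6) and [6, 10) and
+    // every term is visited exactly once.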
std::vector> workers; workers.reserve(n); for (std::size_t tid = 0; tid < n; ++tid) { workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, - vmapped, msgs, args...)); + num_vars_per_term, num_vars_shared_terms, partials, vmapped, msgs, + args...)); } /* - std::cout << "--------------------------------------------------------------------------------" << std::endl - << "worker count = " << pool.worker_count() << std::endl - << "team size = " << pool.team_size() << std::endl - << "gs = " << gs << std::endl - << std::endl << std::endl; + std::cout << + "--------------------------------------------------------------------------------" + << std::endl + << "worker count = " << pool.worker_count() << std::endl + << "team size = " << pool.team_size() << std::endl + << "gs = " << gs << std::endl + << std::endl << std::endl; */ - + // Static partition: each participant gets a contiguous block once pool.parallel_region(n, [&](std::size_t tid) { const std::size_t b0 = (num_terms * tid) / n; @@ -255,10 +267,12 @@ struct reduce_sum_impl, ReturnType, Ve partials[num_vars_sliced_terms + i] = shared_adj[i]; } - if (msgs) *msgs << all_msgs.str(); + if (msgs) + *msgs << all_msgs.str(); return var(new precomputed_gradients_vari( - total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); } }; From 9d4a4aec2417086085f333c535eac29fdc2b69d0 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 21:38:58 -0500 Subject: [PATCH 14/17] updating concurrent_vector --- stan/math/opencl/concurrent_vector.hpp | 290 +++++++++++++++++-------- stan/math/opencl/kernel_cl.hpp | 8 +- stan/math/opencl/opencl_context.hpp | 7 +- 3 files changed, 206 insertions(+), 99 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 46b3e448f19..095643daf61 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -2,58 +2,77 @@ #define STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP #include +#include +#include #include #include +#include #include +#include #include #include -#include -#include -#include namespace stan { namespace math { namespace internal { /** - * Minimal segmented concurrent_vector. - * - * Key properties: - * - concurrent emplace_back/push_back using an atomic size counter. - * - segmented storage => no moving elements during growth, stable addresses. - * - segments allocated lazily; allocation uses CAS to avoid locks. + * Segmented concurrent_vector. * - * Important constraints / notes: - * - operator[] is safe if you only read indices < size() that are known to be - * constructed. - * - For "publish-then-read" correctness: size() is updated before construction - * finishes. So consumers must not read index i just because i < size(); they - * must have a stronger protocol (e.g., producer hands out index, or you add a - * "constructed" bitmap). This matches common usage where only the pushing - * thread uses the returned index. - * - clear()/destruction are NOT concurrent with pushes. 
+ * Properties: + * - concurrent emplace_back/push_back via atomic size counter + * - segmented storage => no moving elements during growth, stable addresses + * - segments allocated lazily; allocation uses CAS to avoid locks + * - movable (required for return-by-value usage such as vec_concat) * - * If you need "readers can iterate up to size() safely while writers push", - * add a constructed flag per element (see comment near emplace_back()). + * Notes: + * - Intended for append-then-read patterns. + * - size_ increments before construction finishes. If readers iterate up to size() + * concurrently with writers, you need a constructed/published protocol. + * - clear()/destruction are NOT concurrent with pushes/reads. */ template class concurrent_vector { static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, - "BaseSegmentSize must be a power of two (helps mapping)."); + "BaseSegmentSize must be a power of two"); + static_assert(MaxSegments > 0, "MaxSegments must be > 0"); public: - concurrent_vector() : size_(0) { - segments_.resize(MaxSegments); - for (auto& p : segments_) - p.store(nullptr, std::memory_order_relaxed); + concurrent_vector() noexcept : size_(0) { + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); } - concurrent_vector(const concurrent_vector&) = delete; - concurrent_vector& operator=(const concurrent_vector&) = delete; + concurrent_vector(const concurrent_vector& other) : concurrent_vector() { + copy_from_(other); + } - ~concurrent_vector() { destroy_all_(); } + concurrent_vector& operator=(const concurrent_vector& other) { + if (this != &other) { + clear(); + copy_from_(other); + } + return *this; + } + + // Movable (needed so Stan can return-by-value) + concurrent_vector(concurrent_vector&& other) noexcept : size_(0) { + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + move_from_(other); + } + + concurrent_vector& operator=(concurrent_vector&& other) noexcept { + if (this != &other) { + destroy_all_(); + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + size_.store(0, std::memory_order_relaxed); + move_from_(other); + } + return *this; + } + + ~concurrent_vector() noexcept { destroy_all_(); } std::size_t size() const noexcept { return size_.load(std::memory_order_acquire); @@ -61,40 +80,39 @@ class concurrent_vector { bool empty() const noexcept { return size() == 0; } - // Non-concurrent: safe only when no other threads are pushing/reading. + // Not concurrent with pushes/reads. void clear() { destroy_all_(); size_.store(0, std::memory_order_release); } - // Concurrent append (construct in place). Returns the index. + // Pre-allocate enough segments to back indices [0, capacity-1]. + // Safe to call concurrently with emplace_back (may race allocating segments; losers free). + void reserve(std::size_t capacity) { + if (capacity == 0) return; + const std::size_t last = capacity - 1; + const std::size_t last_seg = segment_index_(last); + if (last_seg >= MaxSegments) { + throw std::length_error("concurrent_vector::reserve: exceeds MaxSegments"); + } + for (std::size_t s = 0; s <= last_seg; ++s) { + ensure_segment_(s); + } + } + template std::size_t emplace_back(Args&&... 
args) { - // Claim an index const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); - - // Ensure the segment exists T* seg = ensure_segment_for_index_(idx); - - // Placement-new into the correct slot - const std::size_t off = offset_in_segment_(idx); - T* slot = seg + off; - - // Construct element + T* slot = seg + offset_in_segment_(idx); ::new (static_cast(slot)) T(std::forward(args)...); - - // If you need "safe iteration by other threads that use size()", - // you must publish construction completion separately, e.g.: - // constructed_[idx].store(true, release); - // and readers check constructed_[i].load(acquire). return idx; } std::size_t push_back(const T& v) { return emplace_back(v); } std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } - // Returns pointer to element at i (no bounds check). - // Safe if element i is fully constructed and lifetime is valid. + // Pointer helper (no bounds check). T* data_at(std::size_t i) noexcept { T* seg = segment_ptr_(segment_index_(i)); return seg + offset_in_segment_(i); @@ -104,30 +122,99 @@ class concurrent_vector { return seg + offset_in_segment_(i); } - // Bounds-checked access (still not concurrent-safe unless you have a - // protocol). + // Bounds-checked access. T& at(std::size_t i) { - if (i >= size()) - throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } const T& at(std::size_t i) const { - if (i >= size()) - throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } - // Unchecked access + // Unchecked access. T& operator[](std::size_t i) noexcept { return *data_at(i); } const T& operator[](std::size_t i) const noexcept { return *data_at(i); } - // Capacity is segmented and unbounded until MaxSegments is exceeded. - // This is the max number of elements representable by the segment scheme. - static constexpr std::size_t max_size() noexcept { - // Total capacity = Base * (2^MaxSegments - 1) - // but beware overflow for large MaxSegments. 
- return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + // ------------------------- + // Iterators (InputIterator is enough for std::vector(first,last)) + // ------------------------- + + class iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using reference = T&; + + iterator() : v_(nullptr), i_(0) {} + iterator(concurrent_vector* v, std::size_t i) : v_(v), i_(i) {} + + reference operator*() const { return (*v_)[i_]; } + pointer operator->() const { return &(*v_)[i_]; } + + iterator& operator++() { ++i_; return *this; } + iterator operator++(int) { iterator tmp = *this; ++(*this); return tmp; } + + friend bool operator==(const iterator& a, const iterator& b) { + return a.v_ == b.v_ && a.i_ == b.i_; + } + friend bool operator!=(const iterator& a, const iterator& b) { return !(a == b); } + + private: + concurrent_vector* v_; + std::size_t i_; + }; + + class const_iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = const T*; + using reference = const T&; + + const_iterator() : v_(nullptr), i_(0) {} + const_iterator(const concurrent_vector* v, std::size_t i) : v_(v), i_(i) {} + + reference operator*() const { return (*v_)[i_]; } + pointer operator->() const { return &(*v_)[i_]; } + + const_iterator& operator++() { ++i_; return *this; } + const_iterator operator++(int) { const_iterator tmp = *this; ++(*this); return tmp; } + + friend bool operator==(const const_iterator& a, const const_iterator& b) { + return a.v_ == b.v_ && a.i_ == b.i_; + } + friend bool operator!=(const const_iterator& a, const const_iterator& b) { return !(a == b); } + + private: + const concurrent_vector* v_; + std::size_t i_; + }; + + iterator begin() noexcept { return iterator(this, 0); } + iterator end() noexcept { return iterator(this, size()); } // snapshot at call time + + const_iterator begin() const noexcept { return const_iterator(this, 0); } + const_iterator end() const noexcept { return const_iterator(this, size()); } + + const_iterator cbegin() const noexcept { return const_iterator(this, 0); } + const_iterator cend() const noexcept { return const_iterator(this, size()); } + + T& back() { + const std::size_t n = size(); + if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + return (*this)[n - 1]; + } + + const T& back() const { + const std::size_t n = size(); + if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + return (*this)[n - 1]; } + // ------------------------- private: // Segment k has size BaseSegmentSize * 2^k @@ -135,28 +222,29 @@ class concurrent_vector { return BaseSegmentSize << k; } - // Prefix count before segment k: - // Base * (2^k - 1) + // Prefix elements before segment k: Base * (2^k - 1) static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { return BaseSegmentSize * ((std::size_t{1} << k) - 1); } - // Map global index -> segment index. + // Map global index -> segment index // Let q = idx / Base. Then segment = floor(log2(q + 1)). 
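+  // For illustration, take Base = 8: indices 0..7 land in segment 0 (8
+  // slots), 8..23 in segment 1 (16 slots, offset = idx - 8), 24..55 in
+  // segment 2 (32 slots, offset = idx - 24), and so on, since segment k
+  // starts at global index Base * (2^k - 1).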
static std::size_t segment_index_(std::size_t idx) noexcept { const std::size_t q = idx / BaseSegmentSize; const std::size_t x = q + 1; #if defined(__GNUG__) || defined(__clang__) - // floor(log2(x)) via clz - return (sizeof(std::size_t) * 8 - 1) - - static_cast(__builtin_clzl(x)); + if constexpr (sizeof(std::size_t) == 8) { + return 63u - static_cast( + __builtin_clzll(static_cast(x))); + } else { + return 31u - static_cast( + __builtin_clzl(static_cast(x))); + } #else - // portable fallback std::size_t s = 0; std::size_t t = x; - while (t >>= 1) - ++s; + while (t >>= 1) ++s; return s; #endif } @@ -173,60 +261,76 @@ class concurrent_vector { return static_cast(segments_[s].load(std::memory_order_acquire)); } - T* ensure_segment_for_index_(std::size_t idx) { - const std::size_t s = segment_index_(idx); - if (s >= MaxSegments) { - throw std::length_error("concurrent_vector: exceeded MaxSegments"); - } - + T* ensure_segment_(std::size_t s) { T* seg = segment_ptr_(s); - if (seg) - return seg; + if (seg) return seg; - // Allocate segment lazily (raw storage for T objects) const std::size_t n = segment_size_(s); void* raw = ::operator new(sizeof(T) * n); T* fresh = static_cast(raw); - // CAS install; if another thread won, free ours. void* expected = nullptr; if (!segments_[s].compare_exchange_strong(expected, fresh, std::memory_order_release, std::memory_order_acquire)) { ::operator delete(raw); - seg = static_cast(segments_[s].load(std::memory_order_acquire)); + seg = segment_ptr_(s); assert(seg != nullptr); return seg; } - return fresh; } - // Destroy constructed elements and free segments. - // Not concurrent with pushes or reads. + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + return ensure_segment_(s); + } + void destroy_all_() noexcept { const std::size_t n = size_.load(std::memory_order_acquire); - // Destroy elements that were constructed. - // NOTE: This assumes indices [0, n) are all constructed. - // If you allow exceptions or partial construction, track constructed flags. + // Assumes [0, n) constructed. for (std::size_t i = 0; i < n; ++i) { data_at(i)->~T(); } - // Free segments - for (std::size_t s = 0; s < segments_.size(); ++s) { - void* p = segments_[s].load(std::memory_order_acquire); - if (p) { - ::operator delete(p); - segments_[s].store(nullptr, std::memory_order_relaxed); - } + for (auto& a : segments_) { + void* p = a.exchange(nullptr, std::memory_order_acq_rel); + if (p) ::operator delete(p); + } + } + + void move_from_(concurrent_vector& other) noexcept { + // Steal size + const std::size_t n = other.size_.exchange(0, std::memory_order_acq_rel); + size_.store(n, std::memory_order_release); + + // Steal segments + for (std::size_t s = 0; s < MaxSegments; ++s) { + void* p = other.segments_[s].exchange(nullptr, std::memory_order_acq_rel); + segments_[s].store(p, std::memory_order_release); } } + void copy_from_(const concurrent_vector& other) { + const std::size_t n = other.size(); + if (n == 0) return; + + reserve(n); + // Important: we want size_ to match, but we must construct elements. + // Use emplace_back so construction happens in this container. 
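+    // (Copying, like clear(), is not a concurrent operation: `other` must not
+    // be receiving pushes while it is read here.)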
+ for (std::size_t i = 0; i < n; ++i) { + emplace_back(other[i]); + } + } + std::atomic size_; - std::vector> segments_; + std::array, MaxSegments> segments_; }; + } // namespace internal } // namespace math } // namespace stan diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index 8905243d709..cd97bda09c0 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. */ template * = nullptr> -inline internal::concurrent_vector select_events(const T& m) { - return internal::concurrent_vector{}; +inline stan::math::internal::concurrent_vector select_events(const T& m) { + return stan::math::internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const internal::concurrent_vector& select_events(const K& m) { +inline const stan::math::internal::concurrent_vector& select_events(const K& m) { return m.write_events(); } template * = nullptr, require_any_same_t* = nullptr> -inline internal::concurrent_vector select_events(K& m) { +inline stan::math::internal::concurrent_vector select_events(K& m) { static_assert(!std::is_const::value, "Can not write to const matrix_cl!"); return m.read_write_events(); } diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index b0c435e33e0..6f83296e39b 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -417,8 +417,11 @@ class opencl_context { * @param instance_id if of the device */ inline void select_device(int platform_id, int instance_id) { - for (cl::Kernel* cache : kernel_caches_) { - *cache = cl::Kernel(); + for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { + cl::Kernel*& cache = kernel_caches_[i]; + if (cache) { + *cache = cl::Kernel(); + } } kernel_caches_.clear(); opencl_context_base::select_device(platform_id, instance_id); From 1b7c545755a3850de76980f3dc87795e057e4b3b Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Tue, 23 Dec 2025 21:39:58 -0500 Subject: [PATCH 15/17] [Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1 --- stan/math/opencl/concurrent_vector.hpp | 94 ++++++++++++++++++-------- stan/math/opencl/kernel_cl.hpp | 6 +- stan/math/opencl/opencl_context.hpp | 2 +- 3 files changed, 70 insertions(+), 32 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 095643daf61..1dcadd49152 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -27,8 +27,8 @@ namespace internal { * * Notes: * - Intended for append-then-read patterns. - * - size_ increments before construction finishes. If readers iterate up to size() - * concurrently with writers, you need a constructed/published protocol. + * - size_ increments before construction finishes. If readers iterate up to + * size() concurrently with writers, you need a constructed/published protocol. * - clear()/destruction are NOT concurrent with pushes/reads. */ template = MaxSegments) { - throw std::length_error("concurrent_vector::reserve: exceeds MaxSegments"); + throw std::length_error( + "concurrent_vector::reserve: exceeds MaxSegments"); } for (std::size_t s = 0; s <= last_seg; ++s) { ensure_segment_(s); @@ -124,11 +130,13 @@ class concurrent_vector { // Bounds-checked access. 
T& at(std::size_t i) { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } const T& at(std::size_t i) const { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } @@ -154,13 +162,22 @@ class concurrent_vector { reference operator*() const { return (*v_)[i_]; } pointer operator->() const { return &(*v_)[i_]; } - iterator& operator++() { ++i_; return *this; } - iterator operator++(int) { iterator tmp = *this; ++(*this); return tmp; } + iterator& operator++() { + ++i_; + return *this; + } + iterator operator++(int) { + iterator tmp = *this; + ++(*this); + return tmp; + } friend bool operator==(const iterator& a, const iterator& b) { return a.v_ == b.v_ && a.i_ == b.i_; } - friend bool operator!=(const iterator& a, const iterator& b) { return !(a == b); } + friend bool operator!=(const iterator& a, const iterator& b) { + return !(a == b); + } private: concurrent_vector* v_; @@ -181,13 +198,22 @@ class concurrent_vector { reference operator*() const { return (*v_)[i_]; } pointer operator->() const { return &(*v_)[i_]; } - const_iterator& operator++() { ++i_; return *this; } - const_iterator operator++(int) { const_iterator tmp = *this; ++(*this); return tmp; } + const_iterator& operator++() { + ++i_; + return *this; + } + const_iterator operator++(int) { + const_iterator tmp = *this; + ++(*this); + return tmp; + } friend bool operator==(const const_iterator& a, const const_iterator& b) { return a.v_ == b.v_ && a.i_ == b.i_; } - friend bool operator!=(const const_iterator& a, const const_iterator& b) { return !(a == b); } + friend bool operator!=(const const_iterator& a, const const_iterator& b) { + return !(a == b); + } private: const concurrent_vector* v_; @@ -195,7 +221,9 @@ class concurrent_vector { }; iterator begin() noexcept { return iterator(this, 0); } - iterator end() noexcept { return iterator(this, size()); } // snapshot at call time + iterator end() noexcept { + return iterator(this, size()); + } // snapshot at call time const_iterator begin() const noexcept { return const_iterator(this, 0); } const_iterator end() const noexcept { return const_iterator(this, size()); } @@ -205,13 +233,15 @@ class concurrent_vector { T& back() { const std::size_t n = size(); - if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + if (n == 0) + throw std::out_of_range("concurrent_vector::back on empty"); return (*this)[n - 1]; } const T& back() const { const std::size_t n = size(); - if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + if (n == 0) + throw std::out_of_range("concurrent_vector::back on empty"); return (*this)[n - 1]; } // ------------------------- @@ -235,16 +265,19 @@ class concurrent_vector { #if defined(__GNUG__) || defined(__clang__) if constexpr (sizeof(std::size_t) == 8) { - return 63u - static_cast( - __builtin_clzll(static_cast(x))); + return 63u + - static_cast( + __builtin_clzll(static_cast(x))); } else { - return 31u - static_cast( - __builtin_clzl(static_cast(x))); + return 31u + - static_cast( + __builtin_clzl(static_cast(x))); } #else std::size_t s = 0; std::size_t t = x; - while (t >>= 1) ++s; + while (t >>= 1) + ++s; return s; #endif } @@ -263,7 +296,8 @@ class concurrent_vector { T* ensure_segment_(std::size_t s) { T* seg = segment_ptr_(s); - if (seg) return seg; + if (seg) + return seg; const std::size_t n = 
segment_size_(s); void* raw = ::operator new(sizeof(T) * n); @@ -299,7 +333,8 @@ class concurrent_vector { for (auto& a : segments_) { void* p = a.exchange(nullptr, std::memory_order_acq_rel); - if (p) ::operator delete(p); + if (p) + ::operator delete(p); } } @@ -317,7 +352,8 @@ class concurrent_vector { void copy_from_(const concurrent_vector& other) { const std::size_t n = other.size(); - if (n == 0) return; + if (n == 0) + return; reserve(n); // Important: we want size_ to match, but we must construct elements. @@ -326,7 +362,7 @@ class concurrent_vector { emplace_back(other[i]); } } - + std::atomic size_; std::array, MaxSegments> segments_; }; diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index cd97bda09c0..3dda705d331 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,12 +109,14 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. */ template * = nullptr> -inline stan::math::internal::concurrent_vector select_events(const T& m) { +inline stan::math::internal::concurrent_vector select_events( + const T& m) { return stan::math::internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const stan::math::internal::concurrent_vector& select_events(const K& m) { +inline const stan::math::internal::concurrent_vector& select_events( + const K& m) { return m.write_events(); } template * = nullptr, diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index 6f83296e39b..53e65a05c1d 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -420,7 +420,7 @@ class opencl_context { for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { cl::Kernel*& cache = kernel_caches_[i]; if (cache) { - *cache = cl::Kernel(); + *cache = cl::Kernel(); } } kernel_caches_.clear(); From 00c81b86d3065919de136904568a1530ae842596 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 21:40:50 -0500 Subject: [PATCH 16/17] update --- stan/math/opencl/opencl_context.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index 53e65a05c1d..b0c435e33e0 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -417,11 +417,8 @@ class opencl_context { * @param instance_id if of the device */ inline void select_device(int platform_id, int instance_id) { - for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { - cl::Kernel*& cache = kernel_caches_[i]; - if (cache) { - *cache = cl::Kernel(); - } + for (cl::Kernel* cache : kernel_caches_) { + *cache = cl::Kernel(); } kernel_caches_.clear(); opencl_context_base::select_device(platform_id, instance_id); From e95bf022434185c59927ab374e796857a59d8a1c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Wed, 24 Dec 2025 16:43:06 -0500 Subject: [PATCH 17/17] cpplint --- stan/math/rev/core/team_thread_pool.hpp | 4 ++-- stan/math/rev/functor/map_rect_concurrent.hpp | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 2a77ff644b9..90db011cb7e 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -146,10 +146,10 @@ class TeamThreadPool { if (!s || !*s) return 0; char* end = nullptr; - long v = std::strtol(s, &end, 10); + std::size_t v = static_cast(std::strtol(s, &end, 10)); if 
(end == s || v <= 0)
       return 0;
-    return static_cast<std::size_t>(v);
+    return v;
   }
 
   static std::size_t configured_cap_(std::size_t hw) noexcept {
diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp
index 76d38866885..fe2dee2a380 100644
--- a/stan/math/rev/functor/map_rect_concurrent.hpp
+++ b/stan/math/rev/functor/map_rect_concurrent.hpp
@@ -9,9 +9,6 @@
 #include 
 #include 
 
-//#include 
-//#include 
-
 #include 
 #include 
 #include
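A minimal sketch of how a caller drives the TeamThreadPool introduced above, mirroring the pattern used in map_rect_concurrent.hpp and reduce_sum.hpp: the caller participates as tid 0 and each tid owns one contiguous block. The sum_blocks() function and its names are illustrative only; instance(), team_size() and parallel_region() are the members added in team_thread_pool.hpp.

#include <stan/math/rev/core/team_thread_pool.hpp>

#include <algorithm>
#include <cstddef>
#include <vector>

inline double sum_blocks(const std::vector<double>& x) {
  auto& pool = stan::math::TeamThreadPool::instance();
  // Team size counts the calling thread; never ask for more blocks than terms.
  const std::size_t n = std::min<std::size_t>(pool.team_size(), x.size());
  if (n == 0)
    return 0.0;
  std::vector<double> partial(n, 0.0);
  pool.parallel_region(n, [&](std::size_t tid) {
    // Static partition: tid owns the contiguous block [b0, b1).
    const std::size_t b0 = (x.size() * tid) / n;
    const std::size_t b1 = (x.size() * (tid + 1)) / n;
    for (std::size_t i = b0; i < b1; ++i)
      partial[tid] += x[i];  // blocks are disjoint, so no synchronization
  });
  double total = 0.0;
  for (double p : partial)
    total += p;
  return total;
}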