From 9f72cc36bdc5a9a4e4c2ad3e582dc666c9001010 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 20 Nov 2025 22:03:04 -0500 Subject: [PATCH 01/17] wip: reduce_sum --- make/compiler_flags | 2 +- stan/math/rev/functor/map_rect_concurrent.hpp | 48 +++-- stan/math/rev/functor/reduce_sum.hpp | 168 +++++++++++++++++- 3 files changed, 203 insertions(+), 15 deletions(-) diff --git a/make/compiler_flags b/make/compiler_flags index 177bebca39b..0febc5342de 100644 --- a/make/compiler_flags +++ b/make/compiler_flags @@ -342,7 +342,7 @@ endif # Sets up CXXFLAGS_THREADS to use threading ifdef STAN_THREADS - CXXFLAGS_THREADS ?= -DSTAN_THREADS + CXXFLAGS_THREADS ?= -DSTAN_THREADS -pthread endif ################################################################################ diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index 878261056fb..4e32543adaf 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -12,6 +12,8 @@ #include #include +#include +#include #include namespace stan { @@ -46,18 +48,40 @@ map_rect_concurrent( }; #ifdef STAN_THREADS - // we must use task isolation as described here: - // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html - // this is to ensure that the thread local AD tape ressource is - // not being modified from a different task which may happen - // whenever this function is being used itself in a parallel - // context (like running multiple chains for Stan) - tbb::this_task_arena::isolate([&] { - tbb::parallel_for(tbb::blocked_range(0, num_jobs), - [&](const tbb::blocked_range& r) { - execute_chunk(r.begin(), r.end()); - }); - }); + std::cout << "********************************************************************************" << std::endl; + if (num_jobs > 1) { + // simple chunked threading over [0, num_jobs) + unsigned hw_threads = std::thread::hardware_concurrency(); + if (hw_threads == 0) { + hw_threads = 2; // arbitrary but > 0 + } + + const unsigned max_threads + = static_cast(std::min(hw_threads, num_jobs)); + std::cout << "max_threads = " << max_threads << std::endl; + std::vector threads; + threads.reserve(max_threads); + + const std::size_t chunk + = (num_jobs + max_threads - 1) / max_threads; // ceil + + for (unsigned t = 0; t < max_threads; ++t) { + const std::size_t start = t * chunk; + if (start >= num_jobs) break; + const std::size_t end + = std::min(start + chunk, num_jobs); + + threads.emplace_back([&, start, end] { + execute_chunk(start, end); + }); + } + + for (auto& th : threads) { + th.join(); + } + } else { + execute_chunk(0, num_jobs); + } #else execute_chunk(0, num_jobs); #endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 25cbef1e073..9878acb1307 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -5,6 +5,8 @@ #include #include +#include + #include #include #include @@ -74,13 +76,79 @@ struct reduce_sum_impl, ReturnType, * to zero since the newly created reducer is used to accumulate * an independent partial sum. 
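   * (Under tbb::parallel_reduce a split copy accumulates an independent
   * partial sum that is later combined via join(); the std::thread version
   * introduced below instead constructs one reducer per worker thread up
   * front, so no splitting constructor is needed.)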
*/ + /* recursive_reducer(recursive_reducer& other, tbb::split) : num_vars_per_term_(other.num_vars_per_term_), num_vars_shared_terms_(other.num_vars_shared_terms_), sliced_partials_(other.sliced_partials_), vmapped_(other.vmapped_), args_tuple_(other.args_tuple_) {} + */ + +inline void operator()(std::size_t begin, std::size_t end) { + if (begin == end) { + return; + } + + if (args_adjoints_.size() == 0) { + args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); + } + // local copy of shared arguments in a local stack + if (!local_args_tuple_scope_.args_tuple_holder_) { + local_args_tuple_scope_.stack_.execute([&]() { + math::apply( + [&](auto&&... args) { + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(args)...); + }, + args_tuple_); + }); + } else { + // set adjoints of shared arguments to zero + local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); + } + + auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); + + // Initialize nested autodiff stack + const nested_rev_autodiff begin_nest; + + // Create nested autodiff copies of sliced argument that do not point + // back to main autodiff stack + std::decay_t local_sub_slice; + local_sub_slice.reserve(end - begin); + for (std::size_t i = begin; i < end; ++i) { + local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); + } + + // Perform calculation + var sub_sum_v = math::apply( + [&](auto&&... args) { + return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, + args...); + }, + args_tuple_local); + + // Compute Jacobian + sub_sum_v.grad(); + + // accumulate value + sum_ += sub_sum_v.val(); + + // accumulate adjoints of sliced_arguments + accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, + std::move(local_sub_slice)); + + // accumulate adjoints of shared_arguments + math::apply( + [&](auto&&... args) { + accumulate_adjoints(args_adjoints_.data(), args...); + }, + args_tuple_local); +} + /** * Compute, using nested autodiff, the value and Jacobian of * `ReduceFunction` called over the range defined by r and accumulate those @@ -94,7 +162,7 @@ struct reduce_sum_impl, ReturnType, * * @param r Range over which to compute reduce_sum */ - inline void operator()(const tbb::blocked_range& r) { + /* inline void operator()(const tbb::blocked_range& r) { if (r.empty()) { return; } @@ -163,7 +231,7 @@ struct reduce_sum_impl, ReturnType, }, args_tuple_local); } - + */ /** * Join reducers. Accumuluate the value (sum_) and Jacobian (arg_adoints_) * of the other reducer. @@ -221,6 +289,101 @@ struct reduce_sum_impl, ReturnType, * @param args Shared arguments used in every sum term * @return Summation of all terms */ + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int /*grainsize*/, + std::ostream* msgs, Args&&... 
args) const { + if (vmapped.empty()) { + return var(0.0); + } + + const std::size_t num_terms = vmapped.size(); + const std::size_t num_vars_per_term = count_vars(vmapped[0]); + const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; + const std::size_t num_vars_shared_terms = count_vars(args...); + + vari** varis + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + + save_varis(varis, vmapped); + save_varis(varis + num_vars_sliced_terms, args...); + + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { + partials[i] = 0.0; + } + + // --- simple std::thread parallelism --- + + // how many threads to use + const unsigned hw = std::thread::hardware_concurrency(); + const std::size_t max_threads = hw == 0 ? 2 : hw; + const std::size_t num_threads = std::min(max_threads, num_terms); + + // each thread gets its own reducer, but they all share the same partials buffer + // (sliced_partials_) and write to disjoint regions + std::vector> workers; + workers.reserve(num_threads); + + std::vector threads; + threads.reserve(num_threads); + + std::size_t block_begin = 0; + for (std::size_t t = 0; t < num_threads; ++t) { + std::size_t block_end + = (t + 1 == num_threads) + ? num_terms + : (num_terms * (t + 1)) / num_threads; + + // construct reducer for this thread + workers.emplace_back(std::make_unique( + num_vars_per_term, num_vars_shared_terms, partials, + vmapped, args...)); + + auto* wptr = workers.back().get(); + + threads.emplace_back([wptr, block_begin, block_end]() { + // each worker thread needs its own AD tape + static thread_local ChainableStack ad_tape; + wptr->operator()(block_begin, block_end); + }); + + block_begin = block_end; + } + + for (auto& th : threads) { + th.join(); + } + + // aggregate results + double total_sum = 0.0; + Eigen::VectorXd shared_adjoints + = Eigen::VectorXd::Zero(num_vars_shared_terms); + std::stringstream all_msgs; + + for (auto& w : workers) { + total_sum += w->sum_; + if (w->args_adjoints_.size() != 0) { + shared_adjoints += w->args_adjoints_; + } + all_msgs << w->msgs_.str(); + } + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); + } + + if (msgs) { + *msgs << all_msgs.str(); + } + + return var( + new precomputed_gradients_vari(total_sum, + num_vars_sliced_terms + num_vars_shared_terms, + varis, partials)); +} + /* inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize, std::ostream* msgs, Args&&... 
args) const { if (vmapped.empty()) { @@ -278,6 +441,7 @@ struct reduce_sum_impl, ReturnType, worker.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } + */ }; } // namespace internal From a7a95fb71593ebe90829719755fb39a41d993e39 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 4 Dec 2025 10:36:19 -0500 Subject: [PATCH 02/17] 10x slower --- stan/math/rev/core/simple_thread_pool.hpp | 155 ++++++++ stan/math/rev/functor/reduce_sum.hpp | 446 ++++++---------------- 2 files changed, 278 insertions(+), 323 deletions(-) create mode 100644 stan/math/rev/core/simple_thread_pool.hpp diff --git a/stan/math/rev/core/simple_thread_pool.hpp b/stan/math/rev/core/simple_thread_pool.hpp new file mode 100644 index 00000000000..84e1adf7c55 --- /dev/null +++ b/stan/math/rev/core/simple_thread_pool.hpp @@ -0,0 +1,155 @@ +#ifndef STAN_MATH_REV_CORE_SIMPLE_THREAD_POOL_HPP +#define STAN_MATH_REV_CORE_SIMPLE_THREAD_POOL_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { + +class SimpleThreadPool { +public: + static SimpleThreadPool& instance() { + static SimpleThreadPool pool; + return pool; + } + + SimpleThreadPool(const SimpleThreadPool&) = delete; + SimpleThreadPool& operator=(const SimpleThreadPool&) = delete; + + std::size_t thread_count() const noexcept { return workers_.size(); } + + template + auto submit(F&& f, Args&&... args) + -> std::future> { + using R = std::invoke_result_t; + + auto task_ptr = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + + enqueue_([task_ptr] { (*task_ptr)(); }); + return task_ptr->get_future(); + } + + template + void parallel_region(std::size_t n, F&& fn) { + if (n == 0) return; + + // Avoid nested parallelism deadlocks/oversubscription. + if (in_worker_) { + fn(std::size_t{0}); + return; + } + + const std::size_t tc = thread_count(); + if (tc == 0) { + fn(std::size_t{0}); + return; + } + if (n > tc) n = tc; + + using Fn = std::decay_t; + struct Shared { + std::atomic remaining; + std::mutex m; + std::condition_variable cv; + Fn fn; + Shared(std::size_t n_, Fn&& f_) : remaining(n_), fn(std::move(f_)) {} + }; + + auto shared = std::make_shared(n, Fn(std::forward(fn))); + + for (std::size_t tid = 0; tid < n; ++tid) { + enqueue_([shared, tid] { + shared->fn(tid); + if (shared->remaining.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(shared->m); + shared->cv.notify_one(); + } + }); + } + + std::unique_lock lk(shared->m); + shared->cv.wait(lk, [&] { + return shared->remaining.load(std::memory_order_acquire) == 0; + }); + } + +private: + SimpleThreadPool() : done_(false) { + unsigned hw = std::thread::hardware_concurrency(); + if (hw == 0) hw = 2; + const unsigned num_threads = hw; + + workers_.reserve(num_threads); + for (unsigned i = 0; i < num_threads; ++i) { + workers_.emplace_back([this] { + // Per-worker AD tape (TLS) initialized once. 
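        // (The ChainableStack constructed below gives this worker its own
        // thread-local AD tape; without it, reverse-mode vars created by
        // tasks running on this thread would have no stack to register on.)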
+ static thread_local ChainableStack ad_tape; + + for (;;) { + std::function task; + { + std::unique_lock lock(mtx_); + cv_.wait(lock, [&] { return done_ || !tasks_.empty(); }); + if (done_ && tasks_.empty()) return; + task = std::move(tasks_.front()); + tasks_.pop(); + } + + WorkerScope scope; // sets in_worker_ for all tasks + task(); + } + }); + } + } + + ~SimpleThreadPool() { + { + std::lock_guard lock(mtx_); + done_ = true; + } + cv_.notify_all(); + for (auto& th : workers_) { + if (th.joinable()) th.join(); + } + } + + void enqueue_(std::function task) { + { + std::lock_guard lock(mtx_); + tasks_.emplace(std::move(task)); + } + cv_.notify_one(); + } + + struct WorkerScope { + WorkerScope() : prev_(in_worker_) { in_worker_ = true; } + ~WorkerScope() { in_worker_ = prev_; } + bool prev_; + }; + + static inline thread_local bool in_worker_ = false; + + std::vector workers_; + std::queue> tasks_; + std::mutex mtx_; + std::condition_variable cv_; + bool done_; +}; + +} // namespace math +} // namespace stan + +#endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 9878acb1307..73d868ae097 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -4,15 +4,14 @@ #include #include #include +#include -#include - -#include -#include -#include - -#include +#include +#include +#include #include +#include +#include #include #include @@ -30,8 +29,8 @@ namespace internal { */ template -struct reduce_sum_impl, ReturnType, - Vec, Args...> { +struct reduce_sum_impl, ReturnType, Vec, + Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; using args_tuple_t @@ -41,129 +40,33 @@ struct reduce_sum_impl, ReturnType, scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; - /** - * This struct is used by the TBB to accumulate partial - * sums over consecutive ranges of the input. To distribute the workload, - * the TBB can split larger partial sums into smaller ones in which - * case the splitting copy constructor is used. It is designed to - * meet the Imperative form requirements of `tbb::parallel_reduce`. - * - * @note see link [here](https://tinyurl.com/vp7xw2t) for requirements. - */ struct recursive_reducer { - const size_t num_vars_per_term_; - const size_t num_vars_shared_terms_; // Number of vars in shared arguments - double* sliced_partials_; // Points to adjoints of the partial calculations + const std::size_t num_vars_per_term_; + const std::size_t num_vars_shared_terms_; + double* sliced_partials_; + Vec vmapped_; std::stringstream msgs_; std::tuple args_tuple_; scoped_args_tuple local_args_tuple_scope_; + double sum_{0.0}; Eigen::VectorXd args_adjoints_{0}; template - recursive_reducer(size_t num_vars_per_term, size_t num_vars_shared_terms, - double* sliced_partials, VecT&& vmapped, ArgsT&&... args) + recursive_reducer(std::size_t num_vars_per_term, + std::size_t num_vars_shared_terms, + double* sliced_partials, + VecT&& vmapped, + ArgsT&&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), vmapped_(std::forward(vmapped)), args_tuple_(std::forward(args)...) {} - /* - * This is the copy operator as required for tbb::parallel_reduce - * Imperative form. This requires sum_ and arg_adjoints_ be reset - * to zero since the newly created reducer is used to accumulate - * an independent partial sum. 
- */ - /* - recursive_reducer(recursive_reducer& other, tbb::split) - : num_vars_per_term_(other.num_vars_per_term_), - num_vars_shared_terms_(other.num_vars_shared_terms_), - sliced_partials_(other.sliced_partials_), - vmapped_(other.vmapped_), - args_tuple_(other.args_tuple_) {} - */ - -inline void operator()(std::size_t begin, std::size_t end) { - if (begin == end) { - return; - } - - if (args_adjoints_.size() == 0) { - args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); - } - - // local copy of shared arguments in a local stack - if (!local_args_tuple_scope_.args_tuple_holder_) { - local_args_tuple_scope_.stack_.execute([&]() { - math::apply( - [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( - deep_copy_vars(args)...); - }, - args_tuple_); - }); - } else { - // set adjoints of shared arguments to zero - local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); - } - - auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - - // Initialize nested autodiff stack - const nested_rev_autodiff begin_nest; - - // Create nested autodiff copies of sliced argument that do not point - // back to main autodiff stack - std::decay_t local_sub_slice; - local_sub_slice.reserve(end - begin); - for (std::size_t i = begin; i < end; ++i) { - local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); - } - - // Perform calculation - var sub_sum_v = math::apply( - [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, - args...); - }, - args_tuple_local); - - // Compute Jacobian - sub_sum_v.grad(); - - // accumulate value - sum_ += sub_sum_v.val(); - - // accumulate adjoints of sliced_arguments - accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, - std::move(local_sub_slice)); - - // accumulate adjoints of shared_arguments - math::apply( - [&](auto&&... args) { - accumulate_adjoints(args_adjoints_.data(), args...); - }, - args_tuple_local); -} - - /** - * Compute, using nested autodiff, the value and Jacobian of - * `ReduceFunction` called over the range defined by r and accumulate those - * in member variable sum_ (for the value) and args_adjoints_ (for the - * Jacobian). The nested autodiff uses deep copies of the involved operands - * ensuring that no side effects are implied to the adjoints of the input - * operands which reside potentially on a autodiff tape stored in a - * different thread other than the current thread of execution. This - * function may be called multiple times per object instantiation (so the - * sum_ and args_adjoints_ must be accumulated, not just assigned). - * - * @param r Range over which to compute reduce_sum - */ - /* inline void operator()(const tbb::blocked_range& r) { - if (r.empty()) { + inline void operator()(std::size_t begin, std::size_t end) { + if (begin >= end) { return; } @@ -171,220 +74,64 @@ inline void operator()(std::size_t begin, std::size_t end) { args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); } - // Obtain reference to a local copy of all shared arguments that do - // not point - // back to main autodiff stack - + // Obtain reference to a local copy of all shared arguments that do not + // point back to main autodiff stack. if (!local_args_tuple_scope_.args_tuple_holder_) { - // shared arguments need to be copied to reducer-specific - // scope. In this case no need for zeroing adjoints, since the - // fresh copy has all adjoints set to zero. + // First time: copy shared arguments into reducer-scoped space. 
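        // (deep_copy_vars allocates the new vars on whichever AD stack is
        // currently active, so the copy runs inside stack_.execute() to keep
        // it on this reducer's ScopedChainableStack rather than on the
        // worker's main tape.)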
local_args_tuple_scope_.stack_.execute([&]() { math::apply( [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = std::make_unique< - typename scoped_args_tuple::args_tuple_t>( - deep_copy_vars(args)...); + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(args)...); }, args_tuple_); }); } else { - // set adjoints of shared arguments to zero + // Subsequent calls: reset adjoints of shared arguments. local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); } auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - // Initialize nested autodiff stack + // Start a nested autodiff scope for this chunk. const nested_rev_autodiff begin_nest; - // Create nested autodiff copies of sliced argument that do not point - // back to main autodiff stack + // Copy sliced terms into this nested stack. std::decay_t local_sub_slice; - local_sub_slice.reserve(r.size()); - for (size_t i = r.begin(); i < r.end(); ++i) { + local_sub_slice.reserve(end - begin); + for (std::size_t i = begin; i < end; ++i) { local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); } - // Perform calculation + // Run user reducer over [begin, end-1] var sub_sum_v = math::apply( [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, r.begin(), r.end() - 1, - &msgs_, args...); + return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, + args...); }, args_tuple_local); - // Compute Jacobian + // Compute partial derivatives within this nested stack. sub_sum_v.grad(); - // Accumulate value of reduce_sum + // Accumulate value. sum_ += sub_sum_v.val(); - // Accumulate adjoints of sliced_arguments - accumulate_adjoints(sliced_partials_ + r.begin() * num_vars_per_term_, + // Accumulate adjoints of sliced arguments into the shared partials array. + accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, std::move(local_sub_slice)); - // Accumulate adjoints of shared_arguments + // Accumulate adjoints of shared arguments into this reducer's buffer. math::apply( [&](auto&&... args) { accumulate_adjoints(args_adjoints_.data(), args...); }, args_tuple_local); } - */ - /** - * Join reducers. Accumuluate the value (sum_) and Jacobian (arg_adoints_) - * of the other reducer. - * - * @param rhs Another partial sum - */ - inline void join(const recursive_reducer& rhs) { - sum_ += rhs.sum_; - if (args_adjoints_.size() != 0 && rhs.args_adjoints_.size() != 0) { - args_adjoints_ += rhs.args_adjoints_; - } else if (args_adjoints_.size() == 0 && rhs.args_adjoints_.size() != 0) { - args_adjoints_ = rhs.args_adjoints_; - } - msgs_ << rhs.msgs_.str(); - } }; - /** - * Call an instance of the function `ReduceFunction` on every element - * of an input sequence and sum these terms. - * - * This specialization is parallelized using tbb and works for reverse - * mode autodiff. - * - * ReduceFunction must define an operator() with the same signature as: - * var f(Vec&& vmapped_subset, int start, int end, std::ostream* msgs, - * Args&&... args) - * - * `ReduceFunction` must be default constructible without any arguments - * - * Each call to `ReduceFunction` is responsible for computing the - * start through end (inclusive) terms of the overall sum. All args are - * passed from this function through to the `ReduceFunction` instances. - * However, only the start through end (inclusive) elements of the vmapped - * argument are passed to the `ReduceFunction` instances (as the - * `vmapped_subset` argument). 
- * - * This function distributes computation of the desired sum and the Jacobian - * of that sum over multiple threads by coordinating calls to `ReduceFunction` - * instances. Results are stored as precomputed varis in the autodiff tree. - * - * If auto partitioning is true, break work into pieces automatically, - * taking grainsize as a recommended work size. The partitioning is - * not deterministic nor is the order guaranteed in which partial - * sums are accumulated. Due to floating point imprecisions this will likely - * lead to slight differences in the accumulated results between - * multiple runs. If false, break work deterministically into pieces smaller - * than or equal to grainsize and accumulate all the partial sums - * in the same order. This still may not achieve bitwise reproducibility. - * - * @param vmapped Vector containing one element per term of sum - * @param auto_partitioning Work partitioning style - * @param grainsize Suggested grainsize for tbb - * @param[in, out] msgs The print stream for warning messages - * @param args Shared arguments used in every sum term - * @return Summation of all terms - */ - inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int /*grainsize*/, - std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) { - return var(0.0); - } - - const std::size_t num_terms = vmapped.size(); - const std::size_t num_vars_per_term = count_vars(vmapped[0]); - const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; - const std::size_t num_vars_shared_terms = count_vars(args...); - - vari** varis - = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials - = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - - save_varis(varis, vmapped); - save_varis(varis + num_vars_sliced_terms, args...); - - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { - partials[i] = 0.0; - } - - // --- simple std::thread parallelism --- - - // how many threads to use - const unsigned hw = std::thread::hardware_concurrency(); - const std::size_t max_threads = hw == 0 ? 2 : hw; - const std::size_t num_threads = std::min(max_threads, num_terms); - - // each thread gets its own reducer, but they all share the same partials buffer - // (sliced_partials_) and write to disjoint regions - std::vector> workers; - workers.reserve(num_threads); - - std::vector threads; - threads.reserve(num_threads); - - std::size_t block_begin = 0; - for (std::size_t t = 0; t < num_threads; ++t) { - std::size_t block_end - = (t + 1 == num_threads) - ? 
num_terms - : (num_terms * (t + 1)) / num_threads; - - // construct reducer for this thread - workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, - vmapped, args...)); - - auto* wptr = workers.back().get(); - - threads.emplace_back([wptr, block_begin, block_end]() { - // each worker thread needs its own AD tape - static thread_local ChainableStack ad_tape; - wptr->operator()(block_begin, block_end); - }); - - block_begin = block_end; - } - - for (auto& th : threads) { - th.join(); - } - - // aggregate results - double total_sum = 0.0; - Eigen::VectorXd shared_adjoints - = Eigen::VectorXd::Zero(num_vars_shared_terms); - std::stringstream all_msgs; - - for (auto& w : workers) { - total_sum += w->sum_; - if (w->args_adjoints_.size() != 0) { - shared_adjoints += w->args_adjoints_; - } - all_msgs << w->msgs_.str(); - } - - for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); - } - - if (msgs) { - *msgs << all_msgs.str(); - } - - return var( - new precomputed_gradients_vari(total_sum, - num_vars_sliced_terms + num_vars_shared_terms, - varis, partials)); -} - /* - inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize, + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, std::ostream* msgs, Args&&... args) const { if (vmapped.empty()) { return var(0.0); @@ -395,56 +142,109 @@ inline void operator()(std::size_t begin, std::size_t end) { const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials = ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); + vari** varis + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials + = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (size_t i = 0; i < num_vars_sliced_terms; ++i) { + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { partials[i] = 0.0; } - recursive_reducer worker(num_vars_per_term, num_vars_shared_terms, partials, - std::forward(vmapped), - std::forward(args)...); - - // we must use task isolation as described here: - // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html - // this is to ensure that the thread local AD tape ressource is - // not being modified from a different task which may happen - // whenever this function is being used itself in a parallel - // context (like running multiple chains for Stan) - tbb::this_task_arena::isolate([&] { - if (auto_partitioning) { - tbb::parallel_reduce( - tbb::blocked_range(0, num_terms, grainsize), worker); - } else { - tbb::simple_partitioner partitioner; - tbb::parallel_deterministic_reduce( - tbb::blocked_range(0, num_terms, grainsize), worker, - partitioner); + // Decide chunk size (grainsize) and workers. 
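    // Illustrative numbers only (not from a benchmark): with 8 pool threads,
    // 10,000 terms and grainsize <= 0, the heuristic below targets
    // 8 * 8 = 64 chunks, i.e. gs = ceil(10000 / 64) = 157 terms per chunk.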
+ auto& pool = stan::math::SimpleThreadPool::instance(); + std::size_t pool_threads = pool.thread_count(); + if (pool_threads == 0) pool_threads = 1; + + std::size_t gs; + if (grainsize > 0) { + gs = static_cast(grainsize); + } else { + // Heuristic: target ~8 chunks per worker. + const std::size_t target_chunks = pool_threads * 8; + gs = (num_terms + target_chunks - 1) / target_chunks; + if (gs < 1) gs = 1; + } + + // Serial cutoffs. + if (pool_threads == 1 || num_terms <= gs) { + recursive_reducer r(num_vars_per_term, num_vars_shared_terms, + partials, vmapped, args...); + r(0, num_terms); + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] + = (r.args_adjoints_.size() != 0) ? r.args_adjoints_.coeff(i) : 0.0; + } + if (msgs) { + *msgs << r.msgs_.str(); + } + + return var(new precomputed_gradients_vari( + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); + } + + const std::size_t num_workers = std::min(pool_threads, num_terms); + + // One reducer per worker (reused across chunks). This lets each worker reuse + // its deep-copied shared arguments and local stack. + std::vector> workers; + workers.reserve(num_workers); + for (std::size_t t = 0; t < num_workers; ++t) { + workers.emplace_back(std::make_unique( + num_vars_per_term, num_vars_shared_terms, partials, vmapped, args...)); + } + + std::atomic next{0}; + + // Dynamic scheduling: each worker pulls the next chunk. + pool.parallel_region(num_workers, [&](std::size_t tid) { + auto& w = *workers[tid]; + while (true) { + const std::size_t begin = + next.fetch_add(gs, std::memory_order_relaxed); + if (begin >= num_terms) break; + const std::size_t end = std::min(begin + gs, num_terms); + w(begin, end); } }); - for (size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = worker.args_adjoints_.coeff(i); + // Aggregate results. 
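    // Each chunk already wrote its sliced-term adjoints into a disjoint
    // region of `partials` (offset begin * num_vars_per_term_), so only the
    // scalar sum, the shared-argument adjoints and any messages still need
    // to be combined across workers here.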
+ double total_sum = 0.0; + Eigen::VectorXd shared_adjoints = + Eigen::VectorXd::Zero(num_vars_shared_terms); + std::stringstream all_msgs; + + for (auto& wptr : workers) { + auto& w = *wptr; + total_sum += w.sum_; + if (w.args_adjoints_.size() != 0) { + shared_adjoints += w.args_adjoints_; + } + all_msgs << w.msgs_.str(); + } + + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); } if (msgs) { - *msgs << worker.msgs_.str(); + *msgs << all_msgs.str(); } return var(new precomputed_gradients_vari( - worker.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } - */ }; -} // namespace internal +} // namespace internal } // namespace math } // namespace stan From 5e78da0df51d9b3f9aa620074f941e244cb66bc9 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Thu, 4 Dec 2025 11:28:58 -0500 Subject: [PATCH 03/17] 1.2x --- stan/math/rev/core/team_thread_pool.hpp | 191 +++++++++++++++++++ stan/math/rev/functor/reduce_sum.hpp | 237 +++++++++++++----------- 2 files changed, 315 insertions(+), 113 deletions(-) create mode 100644 stan/math/rev/core/team_thread_pool.hpp diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp new file mode 100644 index 00000000000..8aa7eb419c6 --- /dev/null +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -0,0 +1,191 @@ +#ifndef STAN_MATH_REV_CORE_TEAM_THREAD_POOL_HPP +#define STAN_MATH_REV_CORE_TEAM_THREAD_POOL_HPP + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { + +/** + * Team (epoch) thread pool for low-overhead parallel regions. + * + * - Creates (hw-1) worker threads once. + * - Caller participates with tid=0. + * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). + * - Nested parallelism: if called from a worker thread, runs serial. + * + * Designed for reduce_sum/map_rect style internal parallelism. + */ +class TeamThreadPool { + public: + static TeamThreadPool& instance() { + static TeamThreadPool pool; + return pool; + } + + TeamThreadPool(const TeamThreadPool&) = delete; + TeamThreadPool& operator=(const TeamThreadPool&) = delete; + + // Number of worker threads (excluding caller) + std::size_t worker_count() const noexcept { return workers_.size(); } + + // Total participants available = worker_count + 1 (caller) + std::size_t team_size() const noexcept { return workers_.size() + 1; } + + template + void parallel_region(std::size_t n, F&& fn) { + if (n == 0) return; + + // If called from a worker, run serial to avoid nested deadlocks. 
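    // (A nested parallel_region would otherwise hand work to workers that
    // are already busy executing the enclosing region and then block waiting
    // on them; running the body serially on the calling worker keeps nested
    // reduce_sum / map_rect calls correct, just not parallel.)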
+ if (in_worker_) { + fn(std::size_t{0}); + return; + } + + const std::size_t max_team = team_size(); + if (max_team == 1) { + fn(std::size_t{0}); + return; + } + if (n > max_team) n = max_team; + if (n == 1) { + fn(std::size_t{0}); + return; + } + + // Stable storage for callable during this region + using Fn = std::decay_t; + Fn fn_copy = std::forward(fn); + + // Publish region + remaining_.store(n - 1, std::memory_order_release); // workers only + region_n_.store(n, std::memory_order_release); + region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); + region_call_.store(&call_impl, std::memory_order_release); + + epoch_.fetch_add(1, std::memory_order_acq_rel); + + // Wake workers + { + std::lock_guard lk(wake_m_); + } + wake_cv_.notify_all(); + + // Caller participates as tid=0 + in_worker_ = true; + fn_copy(0); + in_worker_ = false; + + // Wait for workers 1..n-1 + std::unique_lock lk(done_m_); + done_cv_.wait(lk, [&] { + return remaining_.load(std::memory_order_acquire) == 0; + }); + } + + private: + using call_fn_t = void (*)(void*, std::size_t); + + template + static void call_impl(void* ctx, std::size_t tid) { + (*static_cast(ctx))(tid); + } + + TeamThreadPool() + : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), + region_call_(nullptr), remaining_(0) { + unsigned hw = std::thread::hardware_concurrency(); + if (hw == 0) hw = 2; + + // hw-1 worker threads; caller is +1 participant. + const unsigned num_workers = (hw > 1) ? (hw - 1) : 1; + + workers_.reserve(num_workers); + for (unsigned i = 0; i < num_workers; ++i) { + const std::size_t tid = static_cast(i + 1); // workers are 1..N + workers_.emplace_back([this, tid] { + // Per-worker AD tape initialized once + static thread_local ChainableStack ad_tape; + in_worker_ = true; + + std::size_t seen = epoch_.load(std::memory_order_acquire); + for (;;) { + // Sleep until epoch changes or stop requested + { + std::unique_lock lk(wake_m_); + wake_cv_.wait(lk, [&] { + return stop_.load(std::memory_order_acquire) + || epoch_.load(std::memory_order_acquire) != seen; + }); + } + if (stop_.load(std::memory_order_acquire)) break; + + const std::size_t e = epoch_.load(std::memory_order_acquire); + seen = e; + + const std::size_t n = region_n_.load(std::memory_order_acquire); + if (tid >= n) { + continue; // not participating this region + } + + void* ctx = region_ctx_.load(std::memory_order_acquire); + call_fn_t call = region_call_.load(std::memory_order_acquire); + if (call) { + call(ctx, tid); + } + + if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(done_m_); + done_cv_.notify_one(); + } + } + + in_worker_ = false; + }); + } + } + + ~TeamThreadPool() { + stop_.store(true, std::memory_order_release); + { + std::lock_guard lk(wake_m_); + } + wake_cv_.notify_all(); + for (auto& t : workers_) { + if (t.joinable()) t.join(); + } + } + + static inline thread_local bool in_worker_ = false; + + std::vector workers_; + std::atomic stop_; + + // Region publication + std::atomic epoch_; + std::atomic region_n_; + std::atomic region_ctx_; + std::atomic region_call_; + + // Worker wake + std::mutex wake_m_; + std::condition_variable wake_cv_; + + // Completion + std::atomic remaining_; + std::mutex done_m_; + std::condition_variable done_cv_; +}; + +} // namespace math +} // namespace stan + +#endif diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index 73d868ae097..a41f12334f5 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ 
b/stan/math/rev/functor/reduce_sum.hpp @@ -4,14 +4,14 @@ #include #include #include -#include +#include #include -#include #include #include #include #include +#include #include #include @@ -19,113 +19,127 @@ namespace stan { namespace math { namespace internal { -/** - * Var specialization of reduce_sum_impl - * - * @tparam ReduceFunction Type of reducer function - * @tparam ReturnType Must be var - * @tparam Vec Type of sliced argument - * @tparam Args Types of shared arguments - */ template struct reduce_sum_impl, ReturnType, Vec, Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; - using args_tuple_t - = std::tuple()))...>; + using args_tuple_t = + std::tuple>()))...>; std::unique_ptr args_tuple_holder_; - scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; struct recursive_reducer { + using VecRef = std::decay_t; + using args_ptrs_t = std::tuple*...>; + + // Apply a callable to tuple of pointers, dereferencing each pointer. + template + static inline decltype(auto) apply_ptr_tuple_impl( + Fn&& fn, Tuple& t, std::index_sequence) { + return std::forward(fn)((*std::get(t))...); + } + template + static inline decltype(auto) apply_ptr_tuple(Fn&& fn, + std::tuple& t) { + return apply_ptr_tuple_impl(std::forward(fn), t, + std::index_sequence_for{}); + } + const std::size_t num_vars_per_term_; const std::size_t num_vars_shared_terms_; double* sliced_partials_; - Vec vmapped_; - std::stringstream msgs_; - std::tuple args_tuple_; + const VecRef* vmapped_; + args_ptrs_t args_ptrs_; + + // msgs only if requested + std::unique_ptr msgs_; + std::ostream* msgs_out_; + scoped_args_tuple local_args_tuple_scope_; double sum_{0.0}; - Eigen::VectorXd args_adjoints_{0}; + std::vector args_adjoints_; // faster than Eigen::VectorXd + + // Reusable buffer to avoid realloc per chunk + VecRef local_sub_slice_; + std::size_t reserved_ = 0; - template recursive_reducer(std::size_t num_vars_per_term, std::size_t num_vars_shared_terms, double* sliced_partials, - VecT&& vmapped, - ArgsT&&... args) + const VecRef& vmapped, + std::ostream* msgs, + const std::decay_t&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), - vmapped_(std::forward(vmapped)), - args_tuple_(std::forward(args)...) {} + vmapped_(&vmapped), + args_ptrs_(&args...), + msgs_out_(msgs) { + if (msgs_out_) msgs_ = std::make_unique(); + } inline void operator()(std::size_t begin, std::size_t end) { - if (begin >= end) { - return; - } + if (begin >= end) return; - if (args_adjoints_.size() == 0) { - args_adjoints_ = Eigen::VectorXd::Zero(num_vars_shared_terms_); + if (args_adjoints_.empty()) { + args_adjoints_.assign(num_vars_shared_terms_, 0.0); } - // Obtain reference to a local copy of all shared arguments that do not - // point back to main autodiff stack. if (!local_args_tuple_scope_.args_tuple_holder_) { - // First time: copy shared arguments into reducer-scoped space. local_args_tuple_scope_.stack_.execute([&]() { - math::apply( - [&](auto&&... args) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( - deep_copy_vars(args)...); - }, - args_tuple_); + apply_ptr_tuple([&](auto const&... a) { + local_args_tuple_scope_.args_tuple_holder_ = + std::make_unique( + deep_copy_vars(a)...); + }, args_ptrs_); }); } else { - // Subsequent calls: reset adjoints of shared arguments. 
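        // (Zeroing is needed because the same deep-copied shared arguments
        // are reused for every chunk this reducer processes, and the previous
        // chunk's grad() call left their adjoints set.)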
local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); } auto& args_tuple_local = *(local_args_tuple_scope_.args_tuple_holder_); - // Start a nested autodiff scope for this chunk. const nested_rev_autodiff begin_nest; - // Copy sliced terms into this nested stack. - std::decay_t local_sub_slice; - local_sub_slice.reserve(end - begin); + // Reuse per-worker buffer + const std::size_t n = end - begin; + local_sub_slice_.clear(); + if (reserved_ < n) { + local_sub_slice_.reserve(n); + reserved_ = n; + } + for (std::size_t i = begin; i < end; ++i) { - local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i])); + local_sub_slice_.emplace_back(deep_copy_vars((*vmapped_)[i])); } - // Run user reducer over [begin, end-1] + std::ostream* local_msgs = + msgs_ ? static_cast(msgs_.get()) : nullptr; + var sub_sum_v = math::apply( - [&](auto&&... args) { - return ReduceFunction()(local_sub_slice, begin, end - 1, &msgs_, - args...); + [&](auto&&... args_local) { + return ReduceFunction()(local_sub_slice_, begin, end - 1, local_msgs, + args_local...); }, args_tuple_local); - // Compute partial derivatives within this nested stack. sub_sum_v.grad(); - - // Accumulate value. sum_ += sub_sum_v.val(); - // Accumulate adjoints of sliced arguments into the shared partials array. accumulate_adjoints(sliced_partials_ + begin * num_vars_per_term_, - std::move(local_sub_slice)); + std::move(local_sub_slice_)); + + // local_sub_slice_ got moved-from; restore it to a valid empty state + local_sub_slice_.clear(); - // Accumulate adjoints of shared arguments into this reducer's buffer. math::apply( - [&](auto&&... args) { - accumulate_adjoints(args_adjoints_.data(), args...); + [&](auto&&... args_local) { + accumulate_adjoints(args_adjoints_.data(), args_local...); }, args_tuple_local); } @@ -133,114 +147,111 @@ struct reduce_sum_impl, ReturnType, Ve inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) { - return var(0.0); - } + if (vmapped.empty()) return var(0.0); const std::size_t num_terms = vmapped.size(); const std::size_t num_vars_per_term = count_vars(vmapped[0]); const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis - = ChainableStack::instance_->memalloc_.alloc_array( + vari** varis = + ChainableStack::instance_->memalloc_.alloc_array( num_vars_sliced_terms + num_vars_shared_terms); - double* partials - = ChainableStack::instance_->memalloc_.alloc_array( + double* partials = + ChainableStack::instance_->memalloc_.alloc_array( num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) { - partials[i] = 0.0; - } + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) partials[i] = 0.0; + + auto& pool = stan::math::TeamThreadPool::instance(); + const std::size_t max_team = pool.team_size(); - // Decide chunk size (grainsize) and workers. - auto& pool = stan::math::SimpleThreadPool::instance(); - std::size_t pool_threads = pool.thread_count(); - if (pool_threads == 0) pool_threads = 1; + // Choose workers. (Caller participates, so total participants = n) + std::size_t n = std::min(max_team, num_terms == 0 ? 1 : num_terms); + if (n < 1) n = 1; + // Chunking: default to ~2 chunks per participant (lower overhead). 
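    // Illustrative numbers only: with n = 8 participants, 10,000 terms and
    // grainsize <= 0, target_chunks = 16 and gs = ceil(10000 / 16) = 625.
    // Because the partitioning below is static, gs effectively acts as the
    // serial cutoff rather than as a scheduling grain.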
std::size_t gs; if (grainsize > 0) { gs = static_cast(grainsize); + if (gs < 1) gs = 1; } else { - // Heuristic: target ~8 chunks per worker. - const std::size_t target_chunks = pool_threads * 8; + const std::size_t target_chunks = n * 2; gs = (num_terms + target_chunks - 1) / target_chunks; if (gs < 1) gs = 1; } - // Serial cutoffs. - if (pool_threads == 1 || num_terms <= gs) { - recursive_reducer r(num_vars_per_term, num_vars_shared_terms, - partials, vmapped, args...); + // Serial cutoff: if too few terms, don't parallelize. + if (n == 1 || num_terms <= gs) { + recursive_reducer r(num_vars_per_term, num_vars_shared_terms, partials, + vmapped, msgs, args...); r(0, num_terms); - for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] - = (r.args_adjoints_.size() != 0) ? r.args_adjoints_.coeff(i) : 0.0; - } - if (msgs) { - *msgs << r.msgs_.str(); + // write shared adjoints + if (!r.args_adjoints_.empty()) { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = r.args_adjoints_[i]; + } + } else { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + partials[num_vars_sliced_terms + i] = 0.0; + } } + if (msgs && r.msgs_) *msgs << r.msgs_->str(); + return var(new precomputed_gradients_vari( - r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, - partials)); + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } - const std::size_t num_workers = std::min(pool_threads, num_terms); - - // One reducer per worker (reused across chunks). This lets each worker reuse - // its deep-copied shared arguments and local stack. + // One reducer per participant (0..n-1) for static partitioning. + // NOTE: we avoid copying vmapped/args by taking references/pointers inside reducer. std::vector> workers; - workers.reserve(num_workers); - for (std::size_t t = 0; t < num_workers; ++t) { + workers.reserve(n); + for (std::size_t tid = 0; tid < n; ++tid) { workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, vmapped, args...)); + num_vars_per_term, num_vars_shared_terms, partials, + vmapped, msgs, args...)); } - std::atomic next{0}; - - // Dynamic scheduling: each worker pulls the next chunk. - pool.parallel_region(num_workers, [&](std::size_t tid) { - auto& w = *workers[tid]; - while (true) { - const std::size_t begin = - next.fetch_add(gs, std::memory_order_relaxed); - if (begin >= num_terms) break; - const std::size_t end = std::min(begin + gs, num_terms); - w(begin, end); + // Static partition: each participant gets a contiguous block once + pool.parallel_region(n, [&](std::size_t tid) { + const std::size_t b0 = (num_terms * tid) / n; + const std::size_t b1 = (num_terms * (tid + 1)) / n; + if (b0 < b1) { + (*workers[tid])(b0, b1); } }); - // Aggregate results. 
+ // Aggregate double total_sum = 0.0; - Eigen::VectorXd shared_adjoints = - Eigen::VectorXd::Zero(num_vars_shared_terms); + std::vector shared_adj(num_vars_shared_terms, 0.0); std::stringstream all_msgs; for (auto& wptr : workers) { auto& w = *wptr; total_sum += w.sum_; - if (w.args_adjoints_.size() != 0) { - shared_adjoints += w.args_adjoints_; + if (!w.args_adjoints_.empty()) { + for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { + shared_adj[i] += w.args_adjoints_[i]; + } + } + if (msgs && w.msgs_) { + all_msgs << w.msgs_->str(); } - all_msgs << w.msgs_.str(); } for (std::size_t i = 0; i < num_vars_shared_terms; ++i) { - partials[num_vars_sliced_terms + i] = shared_adjoints.coeff(i); + partials[num_vars_sliced_terms + i] = shared_adj[i]; } - if (msgs) { - *msgs << all_msgs.str(); - } + if (msgs) *msgs << all_msgs.str(); return var(new precomputed_gradients_vari( - total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, - partials)); + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); } }; From a124301f018efb1850f1e9865b42cc1dfcbb4775 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 8 Dec 2025 11:07:28 -0500 Subject: [PATCH 04/17] almost working version --- stan/math/rev/core.hpp | 1 + stan/math/rev/core/team_thread_pool.hpp | 87 +++++++++++++++++++++++-- stan/math/rev/functor/reduce_sum.hpp | 9 ++- 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp index 9b7646c7d04..0dd91e8d9a6 100644 --- a/stan/math/rev/core.hpp +++ b/stan/math/rev/core.hpp @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 8aa7eb419c6..8ccbad64b08 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -26,6 +26,16 @@ namespace math { */ class TeamThreadPool { public: + // Call this before first use of TeamThreadPool::instance() + static void set_num_threads(std::size_t n) noexcept { + if (n < 1) n = 1; + user_cap_().store(n, std::memory_order_release); + } + + static std::size_t get_num_threads() noexcept { + return user_cap_().load(std::memory_order_acquire); + } + static TeamThreadPool& instance() { static TeamThreadPool pool; return pool; @@ -42,14 +52,15 @@ class TeamThreadPool { template void parallel_region(std::size_t n, F&& fn) { + //std::cout << "#################### parallel_region, n = " << n << std::endl; if (n == 0) return; // If called from a worker, run serial to avoid nested deadlocks. + //std::cout << "in_worker_ = " << in_worker_ << std::endl; if (in_worker_) { fn(std::size_t{0}); return; } - const std::size_t max_team = team_size(); if (max_team == 1) { fn(std::size_t{0}); @@ -84,14 +95,36 @@ class TeamThreadPool { fn_copy(0); in_worker_ = false; + //std::cout << "waiting for workers" << std::endl; // Wait for workers 1..n-1 std::unique_lock lk(done_m_); done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); + //std::cout << "#################### done" << std::endl << std::endl; + } + +private: + // Function-local static avoids static init order fiasco. 
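  // (set_num_threads() may legitimately be called from another translation
  // unit during static initialization; a function-local static is guaranteed
  // to be constructed on first use, so there is no ordering problem.)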
+ static std::atomic& user_cap_() { + static std::atomic cap{0}; // 0 => "unset" + return cap; + } + + static std::size_t configured_cap_(std::size_t hw) { + // priority: user cap > env var > hw + std::size_t cap = user_cap_().load(std::memory_order_acquire); + if (cap == 0) { + cap = env_num_threads_(); // if you have STAN_NUM_THREADS support + } + if (cap == 0) cap = hw; + + if (cap < 1) cap = 1; + if (cap > hw) cap = hw; // prevent oversubscription by default + return cap; } - private: + using call_fn_t = void (*)(void*, std::size_t); template @@ -99,15 +132,53 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } + static size_t env_num_threads_() { + size_t num_threads = 1; +#ifdef STAN_THREADS + const char* env_stan_num_threads = std::getenv("STAN_NUM_THREADS"); + if (env_stan_num_threads != nullptr) { + try { + const int env_num_threads + = boost::lexical_cast(env_stan_num_threads); + if (env_num_threads > 0) { + num_threads = env_num_threads; + } else if (env_num_threads == -1) { + num_threads = std::thread::hardware_concurrency(); + } else { + invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", + env_stan_num_threads, + "The STAN_NUM_THREADS environment variable is '", + "' but it must be positive or -1"); + } + } catch (const boost::bad_lexical_cast&) { + invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", + env_stan_num_threads, + "The STAN_NUM_THREADS environment variable is '", + "' but it must be a positive number or -1"); + } + } +#endif + return num_threads; +} + + TeamThreadPool() : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), region_call_(nullptr), remaining_(0) { - unsigned hw = std::thread::hardware_concurrency(); - if (hw == 0) hw = 2; - - // hw-1 worker threads; caller is +1 participant. - const unsigned num_workers = (hw > 1) ? (hw - 1) : 1; + unsigned hw_u = std::thread::hardware_concurrency(); + if (hw_u == 0) hw_u = 2; + const std::size_t hw = static_cast(hw_u); + + const std::size_t cap = configured_cap_(hw); + const std::size_t num_workers = (cap > 1) ? 
(cap - 1) : 0; + + std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl + << "hw = " << hw << std::endl + << "num_workers = " << num_workers << std::endl + << "cap = " << cap << std::endl + << std::endl << std::endl; + workers_.reserve(num_workers); for (unsigned i = 0; i < num_workers; ++i) { const std::size_t tid = static_cast(i + 1); // workers are 1..N @@ -151,7 +222,9 @@ class TeamThreadPool { in_worker_ = false; }); } + std::cout << "done with constructor" << std::endl; } + ~TeamThreadPool() { stop_.store(true, std::memory_order_release); diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index a41f12334f5..f3b9dd0136c 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -216,7 +216,14 @@ struct reduce_sum_impl, ReturnType, Ve num_vars_per_term, num_vars_shared_terms, partials, vmapped, msgs, args...)); } - + /* + std::cout << "--------------------------------------------------------------------------------" << std::endl + << "worker count = " << pool.worker_count() << std::endl + << "team size = " << pool.team_size() << std::endl + << "gs = " << gs << std::endl + << std::endl << std::endl; + */ + // Static partition: each participant gets a contiguous block once pool.parallel_region(n, [&](std::size_t tid) { const std::size_t b0 = (num_terms * tid) / n; From 99c269bc059cb30fc1d1423324d62c90a4079dcb Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Mon, 8 Dec 2025 11:24:58 -0500 Subject: [PATCH 05/17] better implementation, still hangs --- stan/math/rev/core/team_thread_pool.hpp | 192 +++++++++++++----------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 8ccbad64b08..b4894b96841 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,8 +6,11 @@ #include #include #include +#include +#include #include #include +#include #include #include @@ -17,16 +20,16 @@ namespace math { /** * Team (epoch) thread pool for low-overhead parallel regions. * - * - Creates (hw-1) worker threads once. - * - Caller participates with tid=0. + * - Workers are created once. + * - Caller participates as tid=0. * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). * - Nested parallelism: if called from a worker thread, runs serial. - * - * Designed for reduce_sum/map_rect style internal parallelism. + * - set_num_threads(k) must be called before instance() to size the pool. */ class TeamThreadPool { public: - // Call this before first use of TeamThreadPool::instance() + // Call before first instance() to control pool size. + // Meaning: total participants INCLUDING caller (tid=0). 
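  // Hypothetical caller-side sketch (illustration only; do_chunk and the
  // thread count are made up and assume hardware_concurrency() >= 4):
  //
  //   stan::math::TeamThreadPool::set_num_threads(4);   // caller + 3 workers
  //   auto& pool = stan::math::TeamThreadPool::instance();
  //   std::vector<double> partial(4, 0.0);
  //   pool.parallel_region(4, [&](std::size_t tid) {
  //     partial[tid] = do_chunk(tid);  // each tid handles a disjoint block
  //   });
  //   const double total = partial[0] + partial[1] + partial[2] + partial[3];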
static void set_num_threads(std::size_t n) noexcept { if (n < 1) n = 1; user_cap_().store(n, std::memory_order_release); @@ -41,26 +44,25 @@ class TeamThreadPool { return pool; } - TeamThreadPool(const TeamThreadPool&) = delete; - TeamThreadPool& operator=(const TeamThreadPool&) = delete; - - // Number of worker threads (excluding caller) + // Worker threads (excluding caller) std::size_t worker_count() const noexcept { return workers_.size(); } - // Total participants available = worker_count + 1 (caller) + // Total possible participants INCLUDING caller std::size_t team_size() const noexcept { return workers_.size() + 1; } template void parallel_region(std::size_t n, F&& fn) { - //std::cout << "#################### parallel_region, n = " << n << std::endl; if (n == 0) return; - // If called from a worker, run serial to avoid nested deadlocks. - //std::cout << "in_worker_ = " << in_worker_ << std::endl; + // Nested parallelism guard: if already on a worker, run serial. if (in_worker_) { fn(std::size_t{0}); return; } + + // Only one active region at a time (this is required for a single shared epoch design). + std::unique_lock region_lock(region_m_); + const std::size_t max_team = team_size(); if (max_team == 1) { fn(std::size_t{0}); @@ -72,11 +74,17 @@ class TeamThreadPool { return; } - // Stable storage for callable during this region using Fn = std::decay_t; - Fn fn_copy = std::forward(fn); + Fn fn_copy = std::forward(fn); // stable storage during this call + + // Exception propagation: capture first exception from any participant. + std::exception_ptr eptr = nullptr; + { + std::lock_guard lk(exc_m_); + exc_ptr_ = &eptr; + } - // Publish region + // Publish region state BEFORE bumping epoch. remaining_.store(n - 1, std::memory_order_release); // workers only region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); @@ -92,39 +100,28 @@ class TeamThreadPool { // Caller participates as tid=0 in_worker_ = true; - fn_copy(0); + try { + fn_copy(0); + } catch (...) { + std::lock_guard lk(exc_m_); + if (eptr == nullptr) eptr = std::current_exception(); + } in_worker_ = false; - //std::cout << "waiting for workers" << std::endl; // Wait for workers 1..n-1 std::unique_lock lk(done_m_); done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); - //std::cout << "#################### done" << std::endl << std::endl; - } - -private: - // Function-local static avoids static init order fiasco. - static std::atomic& user_cap_() { - static std::atomic cap{0}; // 0 => "unset" - return cap; - } - static std::size_t configured_cap_(std::size_t hw) { - // priority: user cap > env var > hw - std::size_t cap = user_cap_().load(std::memory_order_acquire); - if (cap == 0) { - cap = env_num_threads_(); // if you have STAN_NUM_THREADS support - } - if (cap == 0) cap = hw; + // Clear region participation; not strictly necessary but good hygiene. 
+ region_n_.store(0, std::memory_order_release); - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; // prevent oversubscription by default - return cap; + // Rethrow exception (if any) + if (eptr) std::rethrow_exception(eptr); } - + private: using call_fn_t = void (*)(void*, std::size_t); template @@ -132,64 +129,57 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } - static size_t env_num_threads_() { - size_t num_threads = 1; -#ifdef STAN_THREADS - const char* env_stan_num_threads = std::getenv("STAN_NUM_THREADS"); - if (env_stan_num_threads != nullptr) { - try { - const int env_num_threads - = boost::lexical_cast(env_stan_num_threads); - if (env_num_threads > 0) { - num_threads = env_num_threads; - } else if (env_num_threads == -1) { - num_threads = std::thread::hardware_concurrency(); - } else { - invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", - env_stan_num_threads, - "The STAN_NUM_THREADS environment variable is '", - "' but it must be positive or -1"); - } - } catch (const boost::bad_lexical_cast&) { - invalid_argument("get_num_threads(int)", "STAN_NUM_THREADS", - env_stan_num_threads, - "The STAN_NUM_THREADS environment variable is '", - "' but it must be a positive number or -1"); - } - } -#endif - return num_threads; -} + // Function-local static avoids static initialization order issues. + static std::atomic& user_cap_() { + static std::atomic cap{0}; // 0 => unset + return cap; + } + static std::size_t env_num_threads_() noexcept { + const char* s = std::getenv("STAN_NUM_THREADS"); + if (!s || !*s) return 0; + char* end = nullptr; + long v = std::strtol(s, &end, 10); + if (end == s || v <= 0) return 0; + return static_cast(v); + } - TeamThreadPool() - : stop_(false), epoch_(0), region_n_(0), region_ctx_(nullptr), - region_call_(nullptr), remaining_(0) { + static std::size_t configured_cap_(std::size_t hw) noexcept { + // Priority: explicit set_num_threads > STAN_NUM_THREADS > hw + std::size_t cap = user_cap_().load(std::memory_order_acquire); + if (cap == 0) cap = env_num_threads_(); + if (cap == 0) cap = hw; + if (cap < 1) cap = 1; + if (cap > hw) cap = hw; // don’t oversubscribe by default + return cap; + } + + TeamThreadPool() + : stop_(false), + epoch_(0), + region_n_(0), + region_ctx_(nullptr), + region_call_(nullptr), + remaining_(0), + exc_ptr_(nullptr) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; const std::size_t hw = static_cast(hw_u); - + + // Total participants includes caller. const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? 
(cap - 1) : 0; - std::cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" << std::endl - << "hw = " << hw << std::endl - << "num_workers = " << num_workers << std::endl - << "cap = " << cap << std::endl - << std::endl << std::endl; - workers_.reserve(num_workers); - for (unsigned i = 0; i < num_workers; ++i) { - const std::size_t tid = static_cast(i + 1); // workers are 1..N + for (std::size_t i = 0; i < num_workers; ++i) { + const std::size_t tid = i + 1; // workers are 1..num_workers workers_.emplace_back([this, tid] { - // Per-worker AD tape initialized once static thread_local ChainableStack ad_tape; in_worker_ = true; std::size_t seen = epoch_.load(std::memory_order_acquire); for (;;) { - // Sleep until epoch changes or stop requested { std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { @@ -203,28 +193,41 @@ class TeamThreadPool { seen = e; const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) { - continue; // not participating this region - } + if (tid >= n) continue; // not participating this region + + // Ensure we ALWAYS decrement remaining_ once for participating workers. + struct DoneGuard { + std::atomic& rem; + std::mutex& m; + std::condition_variable& cv; + bool active{true}; + ~DoneGuard() { + if (!active) return; + if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { + std::lock_guard lk(m); + cv.notify_one(); + } + } + } guard{remaining_, done_m_, done_cv_}; void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); - if (call) { - call(ctx, tid); - } + if (!call) continue; - if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) { - std::lock_guard lk(done_m_); - done_cv_.notify_one(); + try { + call(ctx, tid); + } catch (...) { + std::lock_guard lk(exc_m_); + if (exc_ptr_ && *exc_ptr_ == nullptr) { + *exc_ptr_ = std::current_exception(); + } } } in_worker_ = false; }); } - std::cout << "done with constructor" << std::endl; } - ~TeamThreadPool() { stop_.store(true, std::memory_order_release); @@ -242,6 +245,9 @@ class TeamThreadPool { std::vector workers_; std::atomic stop_; + // Serialize regions (single shared-region design) + std::mutex region_m_; + // Region publication std::atomic epoch_; std::atomic region_n_; @@ -256,6 +262,10 @@ class TeamThreadPool { std::atomic remaining_; std::mutex done_m_; std::condition_variable done_cv_; + + // Exception plumbing + std::mutex exc_m_; + std::exception_ptr* exc_ptr_; }; } // namespace math From 152610169b937338c80627c74b93b2694e07af7d Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 09:21:53 -0500 Subject: [PATCH 06/17] updating team_thread_pool.hpp --- stan/math/rev/core/team_thread_pool.hpp | 267 +++++++++++++++++++----- 1 file changed, 213 insertions(+), 54 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index b4894b96841..a3726d5e95a 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,30 +6,47 @@ #include #include #include -#include -#include +#include // fprintf, fflush +#include // getenv, strtol +#include // exception_ptr #include #include #include #include #include +// Debug mode: periodic dumps while waiting. +// #define STAN_TEAM_POOL_DEBUG_WAIT 1 + +#if defined(STAN_TEAM_POOL_DEBUG_WAIT) +#include +#endif + namespace stan { namespace math { /** * Team (epoch) thread pool for low-overhead parallel regions. * - * - Workers are created once. 
- * - Caller participates as tid=0. - * - parallel_region(n, fn): runs fn(tid) for tid in [0, n). - * - Nested parallelism: if called from a worker thread, runs serial. - * - set_num_threads(k) must be called before instance() to size the pool. + * Logical tids: + * - caller thread: tid=0 + * - worker threads: tid=1..cap-1 + * + * Key correctness points: + * - Single shared "region" state => serialize parallel_region() with region_m_. + * - Wake generation (wake_gen_) protected by wake_m_ to prevent missed wakeups. + * - Startup barrier ensures all workers are "armed" on wake_gen_ before first use, + * preventing late-start workers from missing the first region. + * + * Debug fields per logical tid: + * - alive[tid] : 1 once the worker thread started (0 means never started / died early) + * - seen[tid] : last epoch observed by that worker (0 means never saw any region) + * - exec[tid] : epoch currently executing (0 means not executing region work) + * - dec[tid] : count of decrements performed by that worker */ class TeamThreadPool { public: - // Call before first instance() to control pool size. - // Meaning: total participants INCLUDING caller (tid=0). + // Total participants INCLUDING caller (tid=0). Call before instance(). static void set_num_threads(std::size_t n) noexcept { if (n < 1) n = 1; user_cap_().store(n, std::memory_order_release); @@ -44,23 +61,58 @@ class TeamThreadPool { return pool; } - // Worker threads (excluding caller) std::size_t worker_count() const noexcept { return workers_.size(); } - - // Total possible participants INCLUDING caller std::size_t team_size() const noexcept { return workers_.size() + 1; } + void dump_state(const char* tag = "TeamThreadPool") const { + const auto epoch = epoch_.load(std::memory_order_acquire); + const auto n = region_n_.load(std::memory_order_acquire); + const auto rem = remaining_.load(std::memory_order_acquire); + + std::fprintf(stderr, + "\n[%s] epoch=%zu region_n=%zu remaining=%zu team_size=%zu wake_gen=%zu ready=%zu/%zu\n", + tag, epoch, n, rem, team_size(), wake_gen_snapshot_(), + ready_count_.load(std::memory_order_acquire), + workers_.size()); + std::fprintf(stderr, " tid | alive | seen | exec | dec | note\n"); + std::fprintf(stderr, " ----+-------+-------+-------+-----+-----------------------------\n"); + + for (std::size_t tid = 1; tid < worker_state_size_; ++tid) { + const unsigned alive = worker_alive_[tid].load(std::memory_order_acquire); + const std::size_t seen = worker_seen_epoch_[tid].load(std::memory_order_acquire); + const std::size_t exec = worker_exec_epoch_[tid].load(std::memory_order_acquire); + const std::size_t dec = worker_decrement_count_[tid].load(std::memory_order_acquire); + + const char* note = ""; + if (!alive) { + note = "NOT ALIVE"; + } else if (tid < n) { + if (seen < epoch) note = "participating but hasn't seen epoch"; + else if (exec == epoch) note = "executing"; + else if (exec != 0) note = "executing (old epoch?)"; + else note = "idle/finished"; + } else { + if (seen == epoch) note = "saw epoch but not participating"; + else note = "not participating"; + } + + std::fprintf(stderr, " %3zu | %u | %5zu | %5zu | %3zu | %s\n", + tid, alive, seen, exec, dec, note); + } + std::fflush(stderr); + } + template void parallel_region(std::size_t n, F&& fn) { if (n == 0) return; - // Nested parallelism guard: if already on a worker, run serial. + // Nested parallelism guard. 
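+    // (Without this guard, a nested launch would block on region_m_ while the
+    // outer region waits for this thread to finish, i.e. a deadlock.)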
if (in_worker_) { fn(std::size_t{0}); return; } - // Only one active region at a time (this is required for a single shared epoch design). + // Single shared region state => serialize launches. std::unique_lock region_lock(region_m_); const std::size_t max_team = team_size(); @@ -75,9 +127,9 @@ class TeamThreadPool { } using Fn = std::decay_t; - Fn fn_copy = std::forward(fn); // stable storage during this call + Fn fn_copy = std::forward(fn); - // Exception propagation: capture first exception from any participant. + // Exception propagation (first exception wins). std::exception_ptr eptr = nullptr; { std::lock_guard lk(exc_m_); @@ -85,20 +137,27 @@ class TeamThreadPool { } // Publish region state BEFORE bumping epoch. - remaining_.store(n - 1, std::memory_order_release); // workers only + remaining_.store(n - 1, std::memory_order_release); region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); region_call_.store(&call_impl, std::memory_order_release); - epoch_.fetch_add(1, std::memory_order_acq_rel); + const std::size_t new_epoch = + epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; - // Wake workers + // std::fprintf(stderr, + // "\n[TeamThreadPool(launch)] epoch=%zu n=%zu expected_workers=%zu team_size=%zu\n", + // new_epoch, n, n - 1, team_size()); + // std::fflush(stderr); + + // Wake workers using wake generation (prevents missed wakeups). { std::lock_guard lk(wake_m_); + ++wake_gen_; } wake_cv_.notify_all(); - // Caller participates as tid=0 + // Caller participates (tid=0). in_worker_ = true; try { fn_copy(0); @@ -108,16 +167,31 @@ class TeamThreadPool { } in_worker_ = false; - // Wait for workers 1..n-1 + // Wait for workers 1..n-1. std::unique_lock lk(done_m_); +#if defined(STAN_TEAM_POOL_DEBUG_WAIT) + auto last_dump = std::chrono::steady_clock::now(); + while (remaining_.load(std::memory_order_acquire) != 0) { + done_cv_.wait_for(lk, std::chrono::milliseconds(250)); + const auto now = std::chrono::steady_clock::now(); + if (now - last_dump > std::chrono::seconds(2) + && remaining_.load(std::memory_order_acquire) != 0) { + std::fprintf(stderr, + "[TeamThreadPool] waiting too long for epoch=%zu (remaining=%zu)\n", + new_epoch, remaining_.load(std::memory_order_acquire)); + dump_state("TeamThreadPool(wait)"); + last_dump = now; + } + } +#else done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); +#endif - // Clear region participation; not strictly necessary but good hygiene. + // Hygiene. region_n_.store(0, std::memory_order_release); - // Rethrow exception (if any) if (eptr) std::rethrow_exception(eptr); } @@ -129,9 +203,8 @@ class TeamThreadPool { (*static_cast(ctx))(tid); } - // Function-local static avoids static initialization order issues. 
static std::atomic& user_cap_() { - static std::atomic cap{0}; // 0 => unset + static std::atomic cap{0}; return cap; } @@ -145,16 +218,19 @@ class TeamThreadPool { } static std::size_t configured_cap_(std::size_t hw) noexcept { - // Priority: explicit set_num_threads > STAN_NUM_THREADS > hw std::size_t cap = user_cap_().load(std::memory_order_acquire); if (cap == 0) cap = env_num_threads_(); if (cap == 0) cap = hw; - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; // don’t oversubscribe by default + if (cap > hw) cap = hw; return cap; } + std::size_t wake_gen_snapshot_() const { + std::lock_guard lk(wake_m_); + return wake_gen_; + } + TeamThreadPool() : stop_(false), epoch_(0), @@ -162,79 +238,149 @@ class TeamThreadPool { region_ctx_(nullptr), region_call_(nullptr), remaining_(0), - exc_ptr_(nullptr) { + exc_ptr_(nullptr), + wake_gen_(0), + ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; const std::size_t hw = static_cast(hw_u); - // Total participants includes caller. const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? (cap - 1) : 0; + worker_state_size_ = cap; + + // raw arrays so atomics aren't moved + worker_alive_.reset(new std::atomic[cap]); + worker_seen_epoch_.reset(new std::atomic[cap]); + worker_exec_epoch_.reset(new std::atomic[cap]); + worker_decrement_count_.reset(new std::atomic[cap]); + + for (std::size_t i = 0; i < cap; ++i) { + worker_alive_[i].store(0u, std::memory_order_relaxed); + worker_seen_epoch_[i].store(0, std::memory_order_relaxed); + worker_exec_epoch_[i].store(0, std::memory_order_relaxed); + worker_decrement_count_[i].store(0, std::memory_order_relaxed); + } + + std::fprintf(stderr, + "[TeamThreadPool(ctor)] cap=%zu (workers=%zu) hw=%zu\n", + cap, num_workers, hw); + std::fflush(stderr); + workers_.reserve(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { - const std::size_t tid = i + 1; // workers are 1..num_workers + const std::size_t tid = i + 1; workers_.emplace_back([this, tid] { static thread_local ChainableStack ad_tape; in_worker_ = true; - std::size_t seen = epoch_.load(std::memory_order_acquire); + if (tid < worker_state_size_) { + worker_alive_[tid].store(1u, std::memory_order_release); + } + + // "Arm" this worker on the current wake generation so it can't miss + // the first region wake. + std::size_t local_gen; + { + std::unique_lock lk(wake_m_); + local_gen = wake_gen_; + } + + // Signal readiness AFTER arming. + ready_count_.fetch_add(1, std::memory_order_acq_rel); + { + std::lock_guard lk(ready_m_); + } + ready_cv_.notify_one(); + for (;;) { + // Wait for wake_gen_ to change (or stop_). { std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { return stop_.load(std::memory_order_acquire) - || epoch_.load(std::memory_order_acquire) != seen; + || wake_gen_ != local_gen; }); + if (stop_.load(std::memory_order_acquire)) break; + local_gen = wake_gen_; } - if (stop_.load(std::memory_order_acquire)) break; + // Observe epoch and region parameters after wake. const std::size_t e = epoch_.load(std::memory_order_acquire); - seen = e; + + if (tid < worker_state_size_) { + worker_seen_epoch_[tid].store(e, std::memory_order_release); + } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) continue; // not participating this region + if (tid >= n) { + continue; + } - // Ensure we ALWAYS decrement remaining_ once for participating workers. + // Always decrement once for participating workers. 
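+          // (A destructor-based guard so the decrement and notify happen
+          // exactly once on every exit path from this iteration, including
+          // `continue` and exceptions.)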
struct DoneGuard { std::atomic& rem; std::mutex& m; std::condition_variable& cv; - bool active{true}; + std::atomic& dec_count; ~DoneGuard() { - if (!active) return; + dec_count.fetch_add(1, std::memory_order_relaxed); if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { std::lock_guard lk(m); cv.notify_one(); } } - } guard{remaining_, done_m_, done_cv_}; + } guard{remaining_, done_m_, done_cv_, worker_decrement_count_[tid]}; + + if (tid < worker_state_size_) { + worker_exec_epoch_[tid].store(e, std::memory_order_release); + } void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); - if (!call) continue; - - try { - call(ctx, tid); - } catch (...) { - std::lock_guard lk(exc_m_); - if (exc_ptr_ && *exc_ptr_ == nullptr) { - *exc_ptr_ = std::current_exception(); + + if (call) { + try { + call(ctx, tid); + } catch (...) { + std::lock_guard lk(exc_m_); + if (exc_ptr_ && *exc_ptr_ == nullptr) { + *exc_ptr_ = std::current_exception(); + } } } + + if (tid < worker_state_size_) { + worker_exec_epoch_[tid].store(0, std::memory_order_release); + } } in_worker_ = false; }); } + + // Startup barrier: ensure all workers are armed and waiting-ready. + { + std::unique_lock lk(ready_m_); + ready_cv_.wait(lk, [&] { + return ready_count_.load(std::memory_order_acquire) == workers_.size(); + }); + } + std::fprintf(stderr, "[TeamThreadPool(ctor)] all workers ready: %zu\n", + workers_.size()); + std::fflush(stderr); } ~TeamThreadPool() { stop_.store(true, std::memory_order_release); { + // bump wake_gen_ so workers wake and see stop_ std::lock_guard lk(wake_m_); + ++wake_gen_; } wake_cv_.notify_all(); + for (auto& t : workers_) { if (t.joinable()) t.join(); } @@ -245,27 +391,40 @@ class TeamThreadPool { std::vector workers_; std::atomic stop_; - // Serialize regions (single shared-region design) + // Serialize regions. std::mutex region_m_; - // Region publication + // Region publication. std::atomic epoch_; std::atomic region_n_; std::atomic region_ctx_; std::atomic region_call_; - // Worker wake - std::mutex wake_m_; + // Wake workers (wake_gen_ is protected by wake_m_). + mutable std::mutex wake_m_; std::condition_variable wake_cv_; + std::size_t wake_gen_; - // Completion + // Startup barrier. + std::mutex ready_m_; + std::condition_variable ready_cv_; + std::atomic ready_count_; + + // Completion. std::atomic remaining_; std::mutex done_m_; std::condition_variable done_cv_; - // Exception plumbing + // Exceptions. std::mutex exc_m_; std::exception_ptr* exc_ptr_; + + // Debug state (arrays of atomics to avoid std::vector> move issues). 
+ std::size_t worker_state_size_{0}; + std::unique_ptr[]> worker_alive_; + std::unique_ptr[]> worker_seen_epoch_; + std::unique_ptr[]> worker_exec_epoch_; + std::unique_ptr[]> worker_decrement_count_; }; } // namespace math From 02589232b4002e8063ef05a0142ebd6b99ee793e Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 20:09:23 -0500 Subject: [PATCH 07/17] updates to team_thread_pool.hpp --- stan/math/rev/core/team_thread_pool.hpp | 212 +++++------------------- 1 file changed, 38 insertions(+), 174 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index a3726d5e95a..538006bde81 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -6,7 +6,6 @@ #include #include #include -#include // fprintf, fflush #include // getenv, strtol #include // exception_ptr #include @@ -15,34 +14,21 @@ #include #include -// Debug mode: periodic dumps while waiting. -// #define STAN_TEAM_POOL_DEBUG_WAIT 1 - -#if defined(STAN_TEAM_POOL_DEBUG_WAIT) -#include -#endif - namespace stan { namespace math { /** - * Team (epoch) thread pool for low-overhead parallel regions. + * TeamThreadPool * - * Logical tids: - * - caller thread: tid=0 - * - worker threads: tid=1..cap-1 + * - Fixed set of worker threads created once. + * - Caller participates as logical tid=0. + * - Worker threads have stable logical tids 1..(cap-1). + * - parallel_region(n, fn): runs fn(tid) for tid in [0, n), exactly once each. * - * Key correctness points: - * - Single shared "region" state => serialize parallel_region() with region_m_. - * - Wake generation (wake_gen_) protected by wake_m_ to prevent missed wakeups. - * - Startup barrier ensures all workers are "armed" on wake_gen_ before first use, - * preventing late-start workers from missing the first region. - * - * Debug fields per logical tid: - * - alive[tid] : 1 once the worker thread started (0 means never started / died early) - * - seen[tid] : last epoch observed by that worker (0 means never saw any region) - * - exec[tid] : epoch currently executing (0 means not executing region work) - * - dec[tid] : count of decrements performed by that worker + * Notes: + * - Nested parallel_region calls from a worker run serially to avoid deadlock. + * - Uses an epoch counter + condition_variable to wake workers per region. + * - Startup barrier ensures all workers are waiting before the first region launch. 
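+ *
+ * Minimal usage sketch (do_chunk is a hypothetical callable, shown only to
+ * illustrate the tid-based API):
+ *
+ *   auto& pool = stan::math::TeamThreadPool::instance();
+ *   pool.parallel_region(pool.team_size(), [&](std::size_t tid) {
+ *     do_chunk(tid);  // invoked exactly once for each tid in [0, n)
+ *   });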
*/ class TeamThreadPool { public: @@ -64,59 +50,21 @@ class TeamThreadPool { std::size_t worker_count() const noexcept { return workers_.size(); } std::size_t team_size() const noexcept { return workers_.size() + 1; } - void dump_state(const char* tag = "TeamThreadPool") const { - const auto epoch = epoch_.load(std::memory_order_acquire); - const auto n = region_n_.load(std::memory_order_acquire); - const auto rem = remaining_.load(std::memory_order_acquire); - - std::fprintf(stderr, - "\n[%s] epoch=%zu region_n=%zu remaining=%zu team_size=%zu wake_gen=%zu ready=%zu/%zu\n", - tag, epoch, n, rem, team_size(), wake_gen_snapshot_(), - ready_count_.load(std::memory_order_acquire), - workers_.size()); - std::fprintf(stderr, " tid | alive | seen | exec | dec | note\n"); - std::fprintf(stderr, " ----+-------+-------+-------+-----+-----------------------------\n"); - - for (std::size_t tid = 1; tid < worker_state_size_; ++tid) { - const unsigned alive = worker_alive_[tid].load(std::memory_order_acquire); - const std::size_t seen = worker_seen_epoch_[tid].load(std::memory_order_acquire); - const std::size_t exec = worker_exec_epoch_[tid].load(std::memory_order_acquire); - const std::size_t dec = worker_decrement_count_[tid].load(std::memory_order_acquire); - - const char* note = ""; - if (!alive) { - note = "NOT ALIVE"; - } else if (tid < n) { - if (seen < epoch) note = "participating but hasn't seen epoch"; - else if (exec == epoch) note = "executing"; - else if (exec != 0) note = "executing (old epoch?)"; - else note = "idle/finished"; - } else { - if (seen == epoch) note = "saw epoch but not participating"; - else note = "not participating"; - } - - std::fprintf(stderr, " %3zu | %u | %5zu | %5zu | %3zu | %s\n", - tid, alive, seen, exec, dec, note); - } - std::fflush(stderr); - } - template void parallel_region(std::size_t n, F&& fn) { if (n == 0) return; - // Nested parallelism guard. + // Prevent nested parallelism from deadlocking the pool. if (in_worker_) { fn(std::size_t{0}); return; } - // Single shared region state => serialize launches. + // Only one active region at a time (shared region state). std::unique_lock region_lock(region_m_); const std::size_t max_team = team_size(); - if (max_team == 1) { + if (max_team <= 1) { fn(std::size_t{0}); return; } @@ -137,27 +85,23 @@ class TeamThreadPool { } // Publish region state BEFORE bumping epoch. - remaining_.store(n - 1, std::memory_order_release); + remaining_.store(n - 1, std::memory_order_release); // workers only region_n_.store(n, std::memory_order_release); region_ctx_.store(static_cast(&fn_copy), std::memory_order_release); region_call_.store(&call_impl, std::memory_order_release); + // Bump epoch to start the region, then wake workers. const std::size_t new_epoch = epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; - // std::fprintf(stderr, - // "\n[TeamThreadPool(launch)] epoch=%zu n=%zu expected_workers=%zu team_size=%zu\n", - // new_epoch, n, n - 1, team_size()); - // std::fflush(stderr); - - // Wake workers using wake generation (prevents missed wakeups). { std::lock_guard lk(wake_m_); - ++wake_gen_; + // epoch_ already updated; the mutex pairs with the cv wait. + (void)new_epoch; } wake_cv_.notify_all(); - // Caller participates (tid=0). + // Caller participates as tid=0. in_worker_ = true; try { fn_copy(0); @@ -169,25 +113,9 @@ class TeamThreadPool { // Wait for workers 1..n-1. 
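+    // remaining_ was seeded with n - 1: the caller's own share (tid 0) has
+    // already run inline above, so only the workers are counted here.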
std::unique_lock lk(done_m_); -#if defined(STAN_TEAM_POOL_DEBUG_WAIT) - auto last_dump = std::chrono::steady_clock::now(); - while (remaining_.load(std::memory_order_acquire) != 0) { - done_cv_.wait_for(lk, std::chrono::milliseconds(250)); - const auto now = std::chrono::steady_clock::now(); - if (now - last_dump > std::chrono::seconds(2) - && remaining_.load(std::memory_order_acquire) != 0) { - std::fprintf(stderr, - "[TeamThreadPool] waiting too long for epoch=%zu (remaining=%zu)\n", - new_epoch, remaining_.load(std::memory_order_acquire)); - dump_state("TeamThreadPool(wait)"); - last_dump = now; - } - } -#else done_cv_.wait(lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); -#endif // Hygiene. region_n_.store(0, std::memory_order_release); @@ -204,7 +132,7 @@ class TeamThreadPool { } static std::atomic& user_cap_() { - static std::atomic cap{0}; + static std::atomic cap{0}; // 0 => unset return cap; } @@ -226,11 +154,6 @@ class TeamThreadPool { return cap; } - std::size_t wake_gen_snapshot_() const { - std::lock_guard lk(wake_m_); - return wake_gen_; - } - TeamThreadPool() : stop_(false), epoch_(0), @@ -239,7 +162,6 @@ class TeamThreadPool { region_call_(nullptr), remaining_(0), exc_ptr_(nullptr), - wake_gen_(0), ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); if (hw_u == 0) hw_u = 2; @@ -248,94 +170,52 @@ class TeamThreadPool { const std::size_t cap = configured_cap_(hw); const std::size_t num_workers = (cap > 1) ? (cap - 1) : 0; - worker_state_size_ = cap; - - // raw arrays so atomics aren't moved - worker_alive_.reset(new std::atomic[cap]); - worker_seen_epoch_.reset(new std::atomic[cap]); - worker_exec_epoch_.reset(new std::atomic[cap]); - worker_decrement_count_.reset(new std::atomic[cap]); - - for (std::size_t i = 0; i < cap; ++i) { - worker_alive_[i].store(0u, std::memory_order_relaxed); - worker_seen_epoch_[i].store(0, std::memory_order_relaxed); - worker_exec_epoch_[i].store(0, std::memory_order_relaxed); - worker_decrement_count_[i].store(0, std::memory_order_relaxed); - } - - std::fprintf(stderr, - "[TeamThreadPool(ctor)] cap=%zu (workers=%zu) hw=%zu\n", - cap, num_workers, hw); - std::fflush(stderr); - workers_.reserve(num_workers); for (std::size_t i = 0; i < num_workers; ++i) { - const std::size_t tid = i + 1; + const std::size_t tid = i + 1; // workers are 1..num_workers workers_.emplace_back([this, tid] { + // Per-worker AD tape initialized once. static thread_local ChainableStack ad_tape; - in_worker_ = true; - - if (tid < worker_state_size_) { - worker_alive_[tid].store(1u, std::memory_order_release); - } + (void)ad_tape; - // "Arm" this worker on the current wake generation so it can't miss - // the first region wake. - std::size_t local_gen; - { - std::unique_lock lk(wake_m_); - local_gen = wake_gen_; - } + in_worker_ = true; - // Signal readiness AFTER arming. - ready_count_.fetch_add(1, std::memory_order_acq_rel); + // Startup barrier: ensure each worker has entered the wait loop once. { - std::lock_guard lk(ready_m_); + std::lock_guard lk(wake_m_); + ready_count_.fetch_add(1, std::memory_order_acq_rel); } ready_cv_.notify_one(); + std::size_t seen_epoch = epoch_.load(std::memory_order_acquire); + for (;;) { - // Wait for wake_gen_ to change (or stop_). + // Wait for a new epoch (or stop). 
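+          // The predicate re-reads epoch_ while holding wake_m_, and the
+          // launcher bumps epoch_ before briefly taking the same mutex, so a
+          // worker either sees the new epoch here or is already blocked in
+          // wait() when notify_all() fires (no missed wakeups).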
{ std::unique_lock lk(wake_m_); wake_cv_.wait(lk, [&] { return stop_.load(std::memory_order_acquire) - || wake_gen_ != local_gen; + || epoch_.load(std::memory_order_acquire) != seen_epoch; }); if (stop_.load(std::memory_order_acquire)) break; - local_gen = wake_gen_; - } - - // Observe epoch and region parameters after wake. - const std::size_t e = epoch_.load(std::memory_order_acquire); - - if (tid < worker_state_size_) { - worker_seen_epoch_[tid].store(e, std::memory_order_release); + seen_epoch = epoch_.load(std::memory_order_acquire); } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) { - continue; - } + if (tid >= n) continue; // not participating this region // Always decrement once for participating workers. struct DoneGuard { std::atomic& rem; std::mutex& m; std::condition_variable& cv; - std::atomic& dec_count; ~DoneGuard() { - dec_count.fetch_add(1, std::memory_order_relaxed); if (rem.fetch_sub(1, std::memory_order_acq_rel) == 1) { std::lock_guard lk(m); cv.notify_one(); } } - } guard{remaining_, done_m_, done_cv_, worker_decrement_count_[tid]}; - - if (tid < worker_state_size_) { - worker_exec_epoch_[tid].store(e, std::memory_order_release); - } + } guard{remaining_, done_m_, done_cv_}; void* ctx = region_ctx_.load(std::memory_order_acquire); call_fn_t call = region_call_.load(std::memory_order_acquire); @@ -350,34 +230,27 @@ class TeamThreadPool { } } } - - if (tid < worker_state_size_) { - worker_exec_epoch_[tid].store(0, std::memory_order_release); - } } in_worker_ = false; }); } - // Startup barrier: ensure all workers are armed and waiting-ready. + // Wait for all workers to reach the wait loop once before returning. { - std::unique_lock lk(ready_m_); + std::unique_lock lk(wake_m_); ready_cv_.wait(lk, [&] { return ready_count_.load(std::memory_order_acquire) == workers_.size(); }); } - std::fprintf(stderr, "[TeamThreadPool(ctor)] all workers ready: %zu\n", - workers_.size()); - std::fflush(stderr); } ~TeamThreadPool() { stop_.store(true, std::memory_order_release); { - // bump wake_gen_ so workers wake and see stop_ std::lock_guard lk(wake_m_); - ++wake_gen_; + // bump epoch to ensure wake predicate flips + epoch_.fetch_add(1, std::memory_order_acq_rel); } wake_cv_.notify_all(); @@ -400,13 +273,11 @@ class TeamThreadPool { std::atomic region_ctx_; std::atomic region_call_; - // Wake workers (wake_gen_ is protected by wake_m_). - mutable std::mutex wake_m_; + // Wake workers. + std::mutex wake_m_; std::condition_variable wake_cv_; - std::size_t wake_gen_; // Startup barrier. - std::mutex ready_m_; std::condition_variable ready_cv_; std::atomic ready_count_; @@ -418,13 +289,6 @@ class TeamThreadPool { // Exceptions. std::mutex exc_m_; std::exception_ptr* exc_ptr_; - - // Debug state (arrays of atomics to avoid std::vector> move issues). 
- std::size_t worker_state_size_{0}; - std::unique_ptr[]> worker_alive_; - std::unique_ptr[]> worker_seen_epoch_; - std::unique_ptr[]> worker_exec_epoch_; - std::unique_ptr[]> worker_decrement_count_; }; } // namespace math From 1cc87bb4b14e27d51908c8c0a323d62d1348c20e Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Fri, 12 Dec 2025 21:12:42 -0500 Subject: [PATCH 08/17] updating make --- make/compiler_flags | 3 --- make/standalone | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/make/compiler_flags b/make/compiler_flags index 0febc5342de..f66d4687506 100644 --- a/make/compiler_flags +++ b/make/compiler_flags @@ -303,9 +303,6 @@ ifneq ($(OS), Windows_NT) LDFLAGS_TBB_RPATH ?= -Wl,-rpath,"$(TBB_LIB)" endif -LDFLAGS_TBB ?= -Wl,-L,"$(TBB_LIB)" $(LDFLAGS_TBB_DTAGS) $(LDFLAGS_TBB_RPATH) - -LDLIBS_TBB ?= -ltbb else diff --git a/make/standalone b/make/standalone index 1dfc9203af1..ca6ffc5b4fd 100644 --- a/make/standalone +++ b/make/standalone @@ -22,7 +22,7 @@ MATH ?= $(realpath $(MATH_MAKE)..)/ # The sundials libraries are only needed for # programs using the stiff ode solver or the # algebra solver -MATH_LIBS ?= $(SUNDIALS_TARGETS) $(MPI_TARGETS) $(TBB_TARGETS) +MATH_LIBS ?= $(SUNDIALS_TARGETS) $(MPI_TARGETS) LDLIBS += $(MATH_LIBS) From 1ada1de9d0bbff236ca3eac263d5df7b04e6cd97 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 12:03:22 -0500 Subject: [PATCH 09/17] map_rect --- stan/math/rev/functor/map_rect_concurrent.hpp | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index 4e32543adaf..b87344f5c7e 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -7,9 +7,10 @@ #include #include #include +#include -#include -#include +//#include +//#include #include #include @@ -34,7 +35,7 @@ map_rect_concurrent( = map_rect_reduce, T_job_param>; using CombineF = map_rect_combine; - const int num_jobs = job_params.size(); + const std::size_t num_jobs = job_params.size(); const vector_d shared_params_dbl = value_of(shared_params); std::vector job_output(num_jobs); std::vector world_f_out(num_jobs, 0); @@ -48,39 +49,25 @@ map_rect_concurrent( }; #ifdef STAN_THREADS - std::cout << "********************************************************************************" << std::endl; - if (num_jobs > 1) { - // simple chunked threading over [0, num_jobs) - unsigned hw_threads = std::thread::hardware_concurrency(); - if (hw_threads == 0) { - hw_threads = 2; // arbitrary but > 0 - } - - const unsigned max_threads - = static_cast(std::min(hw_threads, num_jobs)); - std::cout << "max_threads = " << max_threads << std::endl; - std::vector threads; - threads.reserve(max_threads); - - const std::size_t chunk - = (num_jobs + max_threads - 1) / max_threads; // ceil - - for (unsigned t = 0; t < max_threads; ++t) { - const std::size_t start = t * chunk; - if (start >= num_jobs) break; - const std::size_t end - = std::min(start + chunk, num_jobs); + auto& pool = stan::math::TeamThreadPool::instance(); - threads.emplace_back([&, start, end] { - execute_chunk(start, end); - }); - } + // Total participants includes caller (tid=0). + const std::size_t max_team = pool.team_size(); + const std::size_t n = std::min(max_team, + num_jobs == 0 ? 
1u + : num_jobs); - for (auto& th : threads) { - th.join(); - } - } else { + if (n <= 1 || num_jobs <= 1) { execute_chunk(0, num_jobs); + } else { + pool.parallel_region(n, [&](std::size_t tid) { + const std::size_t nj = num_jobs; + const std::size_t b0 = (nj * tid) / n; + const std::size_t b1 = (nj * (tid + 1)) / n; + if (b0 < b1) { + execute_chunk(b0, b1); + } + }); } #else execute_chunk(0, num_jobs); From 28a32e82d0ceb5add78750ab439fd3f139ac0749 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 14:39:09 -0500 Subject: [PATCH 10/17] adding concurrent_vector --- stan/math/opencl/concurrent_vector.hpp | 226 +++++++++++++++++++++++++ stan/math/opencl/kernel_cl.hpp | 8 +- stan/math/opencl/matrix_cl.hpp | 12 +- stan/math/opencl/opencl_context.hpp | 4 +- 4 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 stan/math/opencl/concurrent_vector.hpp diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp new file mode 100644 index 00000000000..5daef1d8f7c --- /dev/null +++ b/stan/math/opencl/concurrent_vector.hpp @@ -0,0 +1,226 @@ +#ifndef STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP +#define STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { +namespace internal { + + /** + * Minimal segmented concurrent_vector. + * + * Key properties: + * - concurrent emplace_back/push_back using an atomic size counter. + * - segmented storage => no moving elements during growth, stable addresses. + * - segments allocated lazily; allocation uses CAS to avoid locks. + * + * Important constraints / notes: + * - operator[] is safe if you only read indices < size() that are known to be constructed. + * - For "publish-then-read" correctness: size() is updated before construction finishes. + * So consumers must not read index i just because i < size(); they must have a stronger + * protocol (e.g., producer hands out index, or you add a "constructed" bitmap). + * This matches common usage where only the pushing thread uses the returned index. + * - clear()/destruction are NOT concurrent with pushes. + * + * If you need "readers can iterate up to size() safely while writers push", + * add a constructed flag per element (see comment near emplace_back()). + */ + template + class concurrent_vector { + static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); + static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, + "BaseSegmentSize must be a power of two (helps mapping)."); + public: + concurrent_vector() : size_(0) { + segments_.resize(MaxSegments); + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + } + + concurrent_vector(const concurrent_vector&) = delete; + concurrent_vector& operator=(const concurrent_vector&) = delete; + + ~concurrent_vector() { destroy_all_(); } + + std::size_t size() const noexcept { + return size_.load(std::memory_order_acquire); + } + + bool empty() const noexcept { return size() == 0; } + + // Non-concurrent: safe only when no other threads are pushing/reading. + void clear() { + destroy_all_(); + size_.store(0, std::memory_order_release); + } + + // Concurrent append (construct in place). Returns the index. + template + std::size_t emplace_back(Args&&... 
args) { + // Claim an index + const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); + + // Ensure the segment exists + T* seg = ensure_segment_for_index_(idx); + + // Placement-new into the correct slot + const std::size_t off = offset_in_segment_(idx); + T* slot = seg + off; + + // Construct element + ::new (static_cast(slot)) T(std::forward(args)...); + + // If you need "safe iteration by other threads that use size()", + // you must publish construction completion separately, e.g.: + // constructed_[idx].store(true, release); + // and readers check constructed_[i].load(acquire). + return idx; + } + + std::size_t push_back(const T& v) { return emplace_back(v); } + std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } + + // Returns pointer to element at i (no bounds check). + // Safe if element i is fully constructed and lifetime is valid. + T* data_at(std::size_t i) noexcept { + T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + const T* data_at(std::size_t i) const noexcept { + const T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + + // Bounds-checked access (still not concurrent-safe unless you have a protocol). + T& at(std::size_t i) { + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + const T& at(std::size_t i) const { + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + + // Unchecked access + T& operator[](std::size_t i) noexcept { return *data_at(i); } + const T& operator[](std::size_t i) const noexcept { return *data_at(i); } + + // Capacity is segmented and unbounded until MaxSegments is exceeded. + // This is the max number of elements representable by the segment scheme. + static constexpr std::size_t max_size() noexcept { + // Total capacity = Base * (2^MaxSegments - 1) + // but beware overflow for large MaxSegments. + return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + } + + private: + // Segment k has size BaseSegmentSize * 2^k + static constexpr std::size_t segment_size_(std::size_t k) noexcept { + return BaseSegmentSize << k; + } + + // Prefix count before segment k: + // Base * (2^k - 1) + static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { + return BaseSegmentSize * ((std::size_t{1} << k) - 1); + } + + // Map global index -> segment index. + // Let q = idx / Base. Then segment = floor(log2(q + 1)). 
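+      // Worked example with an illustrative BaseSegmentSize of 32:
+      //   idx  0..31  -> q = 0     -> segment 0 (32 slots)
+      //   idx 32..95  -> q = 1..2  -> segment 1 (64 slots)
+      //   idx 96..223 -> q = 3..6  -> segment 2 (128 slots)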
+ static std::size_t segment_index_(std::size_t idx) noexcept { + const std::size_t q = idx / BaseSegmentSize; + const std::size_t x = q + 1; + +#if defined(__GNUG__) || defined(__clang__) + // floor(log2(x)) via clz + return (sizeof(std::size_t) * 8 - 1) - static_cast(__builtin_clzl(x)); +#else + // portable fallback + std::size_t s = 0; + std::size_t t = x; + while (t >>= 1) ++s; + return s; +#endif + } + + static std::size_t offset_in_segment_(std::size_t idx) noexcept { + const std::size_t s = segment_index_(idx); + return idx - segment_prefix_(s); + } + + T* segment_ptr_(std::size_t s) noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + const T* segment_ptr_(std::size_t s) const noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + + T* seg = segment_ptr_(s); + if (seg) return seg; + + // Allocate segment lazily (raw storage for T objects) + const std::size_t n = segment_size_(s); + void* raw = ::operator new(sizeof(T) * n); + T* fresh = static_cast(raw); + + // CAS install; if another thread won, free ours. + void* expected = nullptr; + if (!segments_[s].compare_exchange_strong( + expected, fresh, + std::memory_order_release, + std::memory_order_acquire)) { + ::operator delete(raw); + seg = static_cast(segments_[s].load(std::memory_order_acquire)); + assert(seg != nullptr); + return seg; + } + + return fresh; + } + + // Destroy constructed elements and free segments. + // Not concurrent with pushes or reads. + void destroy_all_() noexcept { + const std::size_t n = size_.load(std::memory_order_acquire); + + // Destroy elements that were constructed. + // NOTE: This assumes indices [0, n) are all constructed. + // If you allow exceptions or partial construction, track constructed flags. + for (std::size_t i = 0; i < n; ++i) { + data_at(i)->~T(); + } + + // Free segments + for (std::size_t s = 0; s < segments_.size(); ++s) { + void* p = segments_[s].load(std::memory_order_acquire); + if (p) { + ::operator delete(p); + segments_[s].store(nullptr, std::memory_order_relaxed); + } + } + } + + std::atomic size_; + std::vector> segments_; + }; +} +} +} + +#endif diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index 8f0b6a66e7d..8905243d709 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. 
*/ template * = nullptr> -inline tbb::concurrent_vector select_events(const T& m) { - return tbb::concurrent_vector{}; +inline internal::concurrent_vector select_events(const T& m) { + return internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const tbb::concurrent_vector& select_events(const K& m) { +inline const internal::concurrent_vector& select_events(const K& m) { return m.write_events(); } template * = nullptr, require_any_same_t* = nullptr> -inline tbb::concurrent_vector select_events(K& m) { +inline internal::concurrent_vector select_events(K& m) { static_assert(!std::is_const::value, "Can not write to const matrix_cl!"); return m.read_write_events(); } diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 4114c1199c8..476a688ce7d 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -51,8 +51,8 @@ class matrix_cl : public matrix_cl_base { int cols_{0}; // Number of columns. // Holds info on if matrix is a special type matrix_cl_view view_{matrix_cl_view::Entire}; - mutable tbb::concurrent_vector write_events_; // Tracks write jobs - mutable tbb::concurrent_vector read_events_; // Tracks reads + mutable internal::concurrent_vector write_events_; // Tracks write jobs + mutable internal::concurrent_vector read_events_; // Tracks reads public: using Scalar = T; // Underlying type of the matrix @@ -100,7 +100,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The write event stack. */ - inline const tbb::concurrent_vector& write_events() const { + inline const internal::concurrent_vector& write_events() const { return write_events_; } @@ -108,7 +108,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. */ - inline const tbb::concurrent_vector& read_events() const { + inline const internal::concurrent_vector& read_events() const { return read_events_; } @@ -116,7 +116,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. 
*/ - inline const tbb::concurrent_vector read_write_events() const { + inline const internal::concurrent_vector read_write_events() const { return vec_concat(this->read_events(), this->write_events()); } diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index e2373df126d..b0c435e33e0 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include @@ -208,7 +208,7 @@ class opencl_context_base { * The API to access the methods and values in opencl_context_base */ class opencl_context { - tbb::concurrent_vector kernel_caches_; + internal::concurrent_vector kernel_caches_; public: opencl_context() = default; From ac8a0de9d1bf78854ff4fe628bd63ef952c5b618 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 14:42:46 -0500 Subject: [PATCH 11/17] more changes --- stan/math/prim/functor/reduce_sum_static.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/stan/math/prim/functor/reduce_sum_static.hpp b/stan/math/prim/functor/reduce_sum_static.hpp index 7ad8648aad8..29f70f193d3 100644 --- a/stan/math/prim/functor/reduce_sum_static.hpp +++ b/stan/math/prim/functor/reduce_sum_static.hpp @@ -4,9 +4,6 @@ #include #include #include -#include -#include -#include #include #include From cc68a9f73fc339072accf68458492e62c0306115 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 15:12:21 -0500 Subject: [PATCH 12/17] fixed header --- stan/math/opencl/matrix_cl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 476a688ce7d..4ebc84a1b0a 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include From 593440ffe2a51257cc6c263a9729256ab0e17ea0 Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Tue, 23 Dec 2025 15:17:06 -0500 Subject: [PATCH 13/17] [Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1 --- stan/math/opencl/concurrent_vector.hpp | 408 +++++++++--------- stan/math/opencl/matrix_cl.hpp | 8 +- stan/math/rev/core/simple_thread_pool.hpp | 19 +- stan/math/rev/core/team_thread_pool.hpp | 57 ++- stan/math/rev/functor/map_rect_concurrent.hpp | 5 +- stan/math/rev/functor/reduce_sum.hpp | 104 +++-- 6 files changed, 322 insertions(+), 279 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 5daef1d8f7c..46b3e448f19 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -15,212 +15,220 @@ namespace stan { namespace math { namespace internal { - /** - * Minimal segmented concurrent_vector. - * - * Key properties: - * - concurrent emplace_back/push_back using an atomic size counter. - * - segmented storage => no moving elements during growth, stable addresses. - * - segments allocated lazily; allocation uses CAS to avoid locks. - * - * Important constraints / notes: - * - operator[] is safe if you only read indices < size() that are known to be constructed. - * - For "publish-then-read" correctness: size() is updated before construction finishes. - * So consumers must not read index i just because i < size(); they must have a stronger - * protocol (e.g., producer hands out index, or you add a "constructed" bitmap). - * This matches common usage where only the pushing thread uses the returned index. 
- * - clear()/destruction are NOT concurrent with pushes. - * - * If you need "readers can iterate up to size() safely while writers push", - * add a constructed flag per element (see comment near emplace_back()). - */ - template - class concurrent_vector { - static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); - static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, - "BaseSegmentSize must be a power of two (helps mapping)."); - public: - concurrent_vector() : size_(0) { - segments_.resize(MaxSegments); - for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); - } - - concurrent_vector(const concurrent_vector&) = delete; - concurrent_vector& operator=(const concurrent_vector&) = delete; - - ~concurrent_vector() { destroy_all_(); } - - std::size_t size() const noexcept { - return size_.load(std::memory_order_acquire); - } - - bool empty() const noexcept { return size() == 0; } - - // Non-concurrent: safe only when no other threads are pushing/reading. - void clear() { - destroy_all_(); - size_.store(0, std::memory_order_release); - } - - // Concurrent append (construct in place). Returns the index. - template - std::size_t emplace_back(Args&&... args) { - // Claim an index - const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); - - // Ensure the segment exists - T* seg = ensure_segment_for_index_(idx); - - // Placement-new into the correct slot - const std::size_t off = offset_in_segment_(idx); - T* slot = seg + off; - - // Construct element - ::new (static_cast(slot)) T(std::forward(args)...); - - // If you need "safe iteration by other threads that use size()", - // you must publish construction completion separately, e.g.: - // constructed_[idx].store(true, release); - // and readers check constructed_[i].load(acquire). - return idx; - } - - std::size_t push_back(const T& v) { return emplace_back(v); } - std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } - - // Returns pointer to element at i (no bounds check). - // Safe if element i is fully constructed and lifetime is valid. - T* data_at(std::size_t i) noexcept { - T* seg = segment_ptr_(segment_index_(i)); - return seg + offset_in_segment_(i); - } - const T* data_at(std::size_t i) const noexcept { - const T* seg = segment_ptr_(segment_index_(i)); - return seg + offset_in_segment_(i); - } - - // Bounds-checked access (still not concurrent-safe unless you have a protocol). - T& at(std::size_t i) { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); - return *data_at(i); - } - const T& at(std::size_t i) const { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); - return *data_at(i); - } - - // Unchecked access - T& operator[](std::size_t i) noexcept { return *data_at(i); } - const T& operator[](std::size_t i) const noexcept { return *data_at(i); } - - // Capacity is segmented and unbounded until MaxSegments is exceeded. - // This is the max number of elements representable by the segment scheme. - static constexpr std::size_t max_size() noexcept { - // Total capacity = Base * (2^MaxSegments - 1) - // but beware overflow for large MaxSegments. 
- return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); - } - - private: - // Segment k has size BaseSegmentSize * 2^k - static constexpr std::size_t segment_size_(std::size_t k) noexcept { - return BaseSegmentSize << k; - } - - // Prefix count before segment k: - // Base * (2^k - 1) - static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { - return BaseSegmentSize * ((std::size_t{1} << k) - 1); - } - - // Map global index -> segment index. - // Let q = idx / Base. Then segment = floor(log2(q + 1)). - static std::size_t segment_index_(std::size_t idx) noexcept { - const std::size_t q = idx / BaseSegmentSize; - const std::size_t x = q + 1; +/** + * Minimal segmented concurrent_vector. + * + * Key properties: + * - concurrent emplace_back/push_back using an atomic size counter. + * - segmented storage => no moving elements during growth, stable addresses. + * - segments allocated lazily; allocation uses CAS to avoid locks. + * + * Important constraints / notes: + * - operator[] is safe if you only read indices < size() that are known to be + * constructed. + * - For "publish-then-read" correctness: size() is updated before construction + * finishes. So consumers must not read index i just because i < size(); they + * must have a stronger protocol (e.g., producer hands out index, or you add a + * "constructed" bitmap). This matches common usage where only the pushing + * thread uses the returned index. + * - clear()/destruction are NOT concurrent with pushes. + * + * If you need "readers can iterate up to size() safely while writers push", + * add a constructed flag per element (see comment near emplace_back()). + */ +template +class concurrent_vector { + static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); + static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, + "BaseSegmentSize must be a power of two (helps mapping)."); + + public: + concurrent_vector() : size_(0) { + segments_.resize(MaxSegments); + for (auto& p : segments_) + p.store(nullptr, std::memory_order_relaxed); + } + + concurrent_vector(const concurrent_vector&) = delete; + concurrent_vector& operator=(const concurrent_vector&) = delete; + + ~concurrent_vector() { destroy_all_(); } + + std::size_t size() const noexcept { + return size_.load(std::memory_order_acquire); + } + + bool empty() const noexcept { return size() == 0; } + + // Non-concurrent: safe only when no other threads are pushing/reading. + void clear() { + destroy_all_(); + size_.store(0, std::memory_order_release); + } + + // Concurrent append (construct in place). Returns the index. + template + std::size_t emplace_back(Args&&... args) { + // Claim an index + const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); + + // Ensure the segment exists + T* seg = ensure_segment_for_index_(idx); + + // Placement-new into the correct slot + const std::size_t off = offset_in_segment_(idx); + T* slot = seg + off; + + // Construct element + ::new (static_cast(slot)) T(std::forward(args)...); + + // If you need "safe iteration by other threads that use size()", + // you must publish construction completion separately, e.g.: + // constructed_[idx].store(true, release); + // and readers check constructed_[i].load(acquire). + return idx; + } + + std::size_t push_back(const T& v) { return emplace_back(v); } + std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } + + // Returns pointer to element at i (no bounds check). + // Safe if element i is fully constructed and lifetime is valid. 
+ T* data_at(std::size_t i) noexcept { + T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + const T* data_at(std::size_t i) const noexcept { + const T* seg = segment_ptr_(segment_index_(i)); + return seg + offset_in_segment_(i); + } + + // Bounds-checked access (still not concurrent-safe unless you have a + // protocol). + T& at(std::size_t i) { + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + const T& at(std::size_t i) const { + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); + return *data_at(i); + } + + // Unchecked access + T& operator[](std::size_t i) noexcept { return *data_at(i); } + const T& operator[](std::size_t i) const noexcept { return *data_at(i); } + + // Capacity is segmented and unbounded until MaxSegments is exceeded. + // This is the max number of elements representable by the segment scheme. + static constexpr std::size_t max_size() noexcept { + // Total capacity = Base * (2^MaxSegments - 1) + // but beware overflow for large MaxSegments. + return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + } + + private: + // Segment k has size BaseSegmentSize * 2^k + static constexpr std::size_t segment_size_(std::size_t k) noexcept { + return BaseSegmentSize << k; + } + + // Prefix count before segment k: + // Base * (2^k - 1) + static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { + return BaseSegmentSize * ((std::size_t{1} << k) - 1); + } + + // Map global index -> segment index. + // Let q = idx / Base. Then segment = floor(log2(q + 1)). + static std::size_t segment_index_(std::size_t idx) noexcept { + const std::size_t q = idx / BaseSegmentSize; + const std::size_t x = q + 1; #if defined(__GNUG__) || defined(__clang__) - // floor(log2(x)) via clz - return (sizeof(std::size_t) * 8 - 1) - static_cast(__builtin_clzl(x)); + // floor(log2(x)) via clz + return (sizeof(std::size_t) * 8 - 1) + - static_cast(__builtin_clzl(x)); #else - // portable fallback - std::size_t s = 0; - std::size_t t = x; - while (t >>= 1) ++s; - return s; + // portable fallback + std::size_t s = 0; + std::size_t t = x; + while (t >>= 1) + ++s; + return s; #endif - } - - static std::size_t offset_in_segment_(std::size_t idx) noexcept { - const std::size_t s = segment_index_(idx); - return idx - segment_prefix_(s); - } - - T* segment_ptr_(std::size_t s) noexcept { - return static_cast(segments_[s].load(std::memory_order_acquire)); - } - const T* segment_ptr_(std::size_t s) const noexcept { - return static_cast(segments_[s].load(std::memory_order_acquire)); - } - - T* ensure_segment_for_index_(std::size_t idx) { - const std::size_t s = segment_index_(idx); - if (s >= MaxSegments) { - throw std::length_error("concurrent_vector: exceeded MaxSegments"); - } - - T* seg = segment_ptr_(s); - if (seg) return seg; - - // Allocate segment lazily (raw storage for T objects) - const std::size_t n = segment_size_(s); - void* raw = ::operator new(sizeof(T) * n); - T* fresh = static_cast(raw); - - // CAS install; if another thread won, free ours. - void* expected = nullptr; - if (!segments_[s].compare_exchange_strong( - expected, fresh, - std::memory_order_release, - std::memory_order_acquire)) { - ::operator delete(raw); - seg = static_cast(segments_[s].load(std::memory_order_acquire)); - assert(seg != nullptr); - return seg; - } - - return fresh; - } - - // Destroy constructed elements and free segments. - // Not concurrent with pushes or reads. 
- void destroy_all_() noexcept { - const std::size_t n = size_.load(std::memory_order_acquire); - - // Destroy elements that were constructed. - // NOTE: This assumes indices [0, n) are all constructed. - // If you allow exceptions or partial construction, track constructed flags. - for (std::size_t i = 0; i < n; ++i) { - data_at(i)->~T(); - } - - // Free segments - for (std::size_t s = 0; s < segments_.size(); ++s) { - void* p = segments_[s].load(std::memory_order_acquire); - if (p) { - ::operator delete(p); - segments_[s].store(nullptr, std::memory_order_relaxed); - } + } + + static std::size_t offset_in_segment_(std::size_t idx) noexcept { + const std::size_t s = segment_index_(idx); + return idx - segment_prefix_(s); + } + + T* segment_ptr_(std::size_t s) noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + const T* segment_ptr_(std::size_t s) const noexcept { + return static_cast(segments_[s].load(std::memory_order_acquire)); + } + + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + + T* seg = segment_ptr_(s); + if (seg) + return seg; + + // Allocate segment lazily (raw storage for T objects) + const std::size_t n = segment_size_(s); + void* raw = ::operator new(sizeof(T) * n); + T* fresh = static_cast(raw); + + // CAS install; if another thread won, free ours. + void* expected = nullptr; + if (!segments_[s].compare_exchange_strong(expected, fresh, + std::memory_order_release, + std::memory_order_acquire)) { + ::operator delete(raw); + seg = static_cast(segments_[s].load(std::memory_order_acquire)); + assert(seg != nullptr); + return seg; + } + + return fresh; + } + + // Destroy constructed elements and free segments. + // Not concurrent with pushes or reads. + void destroy_all_() noexcept { + const std::size_t n = size_.load(std::memory_order_acquire); + + // Destroy elements that were constructed. + // NOTE: This assumes indices [0, n) are all constructed. + // If you allow exceptions or partial construction, track constructed flags. + for (std::size_t i = 0; i < n; ++i) { + data_at(i)->~T(); + } + + // Free segments + for (std::size_t s = 0; s < segments_.size(); ++s) { + void* p = segments_[s].load(std::memory_order_acquire); + if (p) { + ::operator delete(p); + segments_[s].store(nullptr, std::memory_order_relaxed); } } + } - std::atomic size_; - std::vector> segments_; - }; -} -} -} + std::atomic size_; + std::vector> segments_; +}; +} // namespace internal +} // namespace math +} // namespace stan #endif diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 4ebc84a1b0a..3d3a4c6e436 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -51,8 +51,9 @@ class matrix_cl : public matrix_cl_base { int cols_{0}; // Number of columns. // Holds info on if matrix is a special type matrix_cl_view view_{matrix_cl_view::Entire}; - mutable internal::concurrent_vector write_events_; // Tracks write jobs - mutable internal::concurrent_vector read_events_; // Tracks reads + mutable internal::concurrent_vector + write_events_; // Tracks write jobs + mutable internal::concurrent_vector read_events_; // Tracks reads public: using Scalar = T; // Underlying type of the matrix @@ -116,7 +117,8 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. 
*/ - inline const internal::concurrent_vector read_write_events() const { + inline const internal::concurrent_vector read_write_events() + const { return vec_concat(this->read_events(), this->write_events()); } diff --git a/stan/math/rev/core/simple_thread_pool.hpp b/stan/math/rev/core/simple_thread_pool.hpp index 84e1adf7c55..16749ee1e5b 100644 --- a/stan/math/rev/core/simple_thread_pool.hpp +++ b/stan/math/rev/core/simple_thread_pool.hpp @@ -19,7 +19,7 @@ namespace stan { namespace math { class SimpleThreadPool { -public: + public: static SimpleThreadPool& instance() { static SimpleThreadPool pool; return pool; @@ -44,7 +44,8 @@ class SimpleThreadPool { template void parallel_region(std::size_t n, F&& fn) { - if (n == 0) return; + if (n == 0) + return; // Avoid nested parallelism deadlocks/oversubscription. if (in_worker_) { @@ -57,7 +58,8 @@ class SimpleThreadPool { fn(std::size_t{0}); return; } - if (n > tc) n = tc; + if (n > tc) + n = tc; using Fn = std::decay_t; struct Shared { @@ -86,10 +88,11 @@ class SimpleThreadPool { }); } -private: + private: SimpleThreadPool() : done_(false) { unsigned hw = std::thread::hardware_concurrency(); - if (hw == 0) hw = 2; + if (hw == 0) + hw = 2; const unsigned num_threads = hw; workers_.reserve(num_threads); @@ -103,7 +106,8 @@ class SimpleThreadPool { { std::unique_lock lock(mtx_); cv_.wait(lock, [&] { return done_ || !tasks_.empty(); }); - if (done_ && tasks_.empty()) return; + if (done_ && tasks_.empty()) + return; task = std::move(tasks_.front()); tasks_.pop(); } @@ -122,7 +126,8 @@ class SimpleThreadPool { } cv_.notify_all(); for (auto& th : workers_) { - if (th.joinable()) th.join(); + if (th.joinable()) + th.join(); } } diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 538006bde81..2a77ff644b9 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -28,13 +28,15 @@ namespace math { * Notes: * - Nested parallel_region calls from a worker run serially to avoid deadlock. * - Uses an epoch counter + condition_variable to wake workers per region. - * - Startup barrier ensures all workers are waiting before the first region launch. + * - Startup barrier ensures all workers are waiting before the first region + * launch. */ class TeamThreadPool { public: // Total participants INCLUDING caller (tid=0). Call before instance(). static void set_num_threads(std::size_t n) noexcept { - if (n < 1) n = 1; + if (n < 1) + n = 1; user_cap_().store(n, std::memory_order_release); } @@ -52,7 +54,8 @@ class TeamThreadPool { template void parallel_region(std::size_t n, F&& fn) { - if (n == 0) return; + if (n == 0) + return; // Prevent nested parallelism from deadlocking the pool. if (in_worker_) { @@ -68,7 +71,8 @@ class TeamThreadPool { fn(std::size_t{0}); return; } - if (n > max_team) n = max_team; + if (n > max_team) + n = max_team; if (n == 1) { fn(std::size_t{0}); return; @@ -91,8 +95,8 @@ class TeamThreadPool { region_call_.store(&call_impl, std::memory_order_release); // Bump epoch to start the region, then wake workers. - const std::size_t new_epoch = - epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; + const std::size_t new_epoch + = epoch_.fetch_add(1, std::memory_order_acq_rel) + 1; { std::lock_guard lk(wake_m_); @@ -107,20 +111,21 @@ class TeamThreadPool { fn_copy(0); } catch (...) 
{ std::lock_guard lk(exc_m_); - if (eptr == nullptr) eptr = std::current_exception(); + if (eptr == nullptr) + eptr = std::current_exception(); } in_worker_ = false; // Wait for workers 1..n-1. std::unique_lock lk(done_m_); - done_cv_.wait(lk, [&] { - return remaining_.load(std::memory_order_acquire) == 0; - }); + done_cv_.wait( + lk, [&] { return remaining_.load(std::memory_order_acquire) == 0; }); // Hygiene. region_n_.store(0, std::memory_order_release); - if (eptr) std::rethrow_exception(eptr); + if (eptr) + std::rethrow_exception(eptr); } private: @@ -138,19 +143,25 @@ class TeamThreadPool { static std::size_t env_num_threads_() noexcept { const char* s = std::getenv("STAN_NUM_THREADS"); - if (!s || !*s) return 0; + if (!s || !*s) + return 0; char* end = nullptr; long v = std::strtol(s, &end, 10); - if (end == s || v <= 0) return 0; + if (end == s || v <= 0) + return 0; return static_cast(v); } static std::size_t configured_cap_(std::size_t hw) noexcept { std::size_t cap = user_cap_().load(std::memory_order_acquire); - if (cap == 0) cap = env_num_threads_(); - if (cap == 0) cap = hw; - if (cap < 1) cap = 1; - if (cap > hw) cap = hw; + if (cap == 0) + cap = env_num_threads_(); + if (cap == 0) + cap = hw; + if (cap < 1) + cap = 1; + if (cap > hw) + cap = hw; return cap; } @@ -164,7 +175,8 @@ class TeamThreadPool { exc_ptr_(nullptr), ready_count_(0) { unsigned hw_u = std::thread::hardware_concurrency(); - if (hw_u == 0) hw_u = 2; + if (hw_u == 0) + hw_u = 2; const std::size_t hw = static_cast(hw_u); const std::size_t cap = configured_cap_(hw); @@ -197,12 +209,14 @@ class TeamThreadPool { return stop_.load(std::memory_order_acquire) || epoch_.load(std::memory_order_acquire) != seen_epoch; }); - if (stop_.load(std::memory_order_acquire)) break; + if (stop_.load(std::memory_order_acquire)) + break; seen_epoch = epoch_.load(std::memory_order_acquire); } const std::size_t n = region_n_.load(std::memory_order_acquire); - if (tid >= n) continue; // not participating this region + if (tid >= n) + continue; // not participating this region // Always decrement once for participating workers. struct DoneGuard { @@ -255,7 +269,8 @@ class TeamThreadPool { wake_cv_.notify_all(); for (auto& t : workers_) { - if (t.joinable()) t.join(); + if (t.joinable()) + t.join(); } } diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp index b87344f5c7e..76d38866885 100644 --- a/stan/math/rev/functor/map_rect_concurrent.hpp +++ b/stan/math/rev/functor/map_rect_concurrent.hpp @@ -53,9 +53,8 @@ map_rect_concurrent( // Total participants includes caller (tid=0). const std::size_t max_team = pool.team_size(); - const std::size_t n = std::min(max_team, - num_jobs == 0 ? 1u - : num_jobs); + const std::size_t n + = std::min(max_team, num_jobs == 0 ? 
1u : num_jobs); if (n <= 1 || num_jobs <= 1) { execute_chunk(0, num_jobs); diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp index f3b9dd0136c..814bc6e3f59 100644 --- a/stan/math/rev/functor/reduce_sum.hpp +++ b/stan/math/rev/functor/reduce_sum.hpp @@ -21,12 +21,12 @@ namespace internal { template -struct reduce_sum_impl, ReturnType, Vec, - Args...> { +struct reduce_sum_impl, ReturnType, + Vec, Args...> { struct scoped_args_tuple { ScopedChainableStack stack_; - using args_tuple_t = - std::tuple>()))...>; + using args_tuple_t = std::tuple>()))...>; std::unique_ptr args_tuple_holder_; scoped_args_tuple() : stack_(), args_tuple_holder_(nullptr) {} }; @@ -70,21 +70,21 @@ struct reduce_sum_impl, ReturnType, Ve recursive_reducer(std::size_t num_vars_per_term, std::size_t num_vars_shared_terms, - double* sliced_partials, - const VecRef& vmapped, - std::ostream* msgs, - const std::decay_t&... args) + double* sliced_partials, const VecRef& vmapped, + std::ostream* msgs, const std::decay_t&... args) : num_vars_per_term_(num_vars_per_term), num_vars_shared_terms_(num_vars_shared_terms), sliced_partials_(sliced_partials), vmapped_(&vmapped), args_ptrs_(&args...), msgs_out_(msgs) { - if (msgs_out_) msgs_ = std::make_unique(); + if (msgs_out_) + msgs_ = std::make_unique(); } inline void operator()(std::size_t begin, std::size_t end) { - if (begin >= end) return; + if (begin >= end) + return; if (args_adjoints_.empty()) { args_adjoints_.assign(num_vars_shared_terms_, 0.0); @@ -92,11 +92,13 @@ struct reduce_sum_impl, ReturnType, Ve if (!local_args_tuple_scope_.args_tuple_holder_) { local_args_tuple_scope_.stack_.execute([&]() { - apply_ptr_tuple([&](auto const&... a) { - local_args_tuple_scope_.args_tuple_holder_ = - std::make_unique( + apply_ptr_tuple( + [&](auto const&... a) { + local_args_tuple_scope_.args_tuple_holder_ = std::make_unique< + typename scoped_args_tuple::args_tuple_t>( deep_copy_vars(a)...); - }, args_ptrs_); + }, + args_ptrs_); }); } else { local_args_tuple_scope_.stack_.execute([] { set_zero_all_adjoints(); }); @@ -118,13 +120,13 @@ struct reduce_sum_impl, ReturnType, Ve local_sub_slice_.emplace_back(deep_copy_vars((*vmapped_)[i])); } - std::ostream* local_msgs = - msgs_ ? static_cast(msgs_.get()) : nullptr; + std::ostream* local_msgs + = msgs_ ? static_cast(msgs_.get()) : nullptr; var sub_sum_v = math::apply( [&](auto&&... args_local) { - return ReduceFunction()(local_sub_slice_, begin, end - 1, local_msgs, - args_local...); + return ReduceFunction()(local_sub_slice_, begin, end - 1, + local_msgs, args_local...); }, args_tuple_local); @@ -145,43 +147,48 @@ struct reduce_sum_impl, ReturnType, Ve } }; - inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, int grainsize, - std::ostream* msgs, Args&&... args) const { - if (vmapped.empty()) return var(0.0); + inline var operator()(Vec&& vmapped, bool /*auto_partitioning*/, + int grainsize, std::ostream* msgs, + Args&&... 
args) const { + if (vmapped.empty()) + return var(0.0); const std::size_t num_terms = vmapped.size(); const std::size_t num_vars_per_term = count_vars(vmapped[0]); const std::size_t num_vars_sliced_terms = num_terms * num_vars_per_term; const std::size_t num_vars_shared_terms = count_vars(args...); - vari** varis = - ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); - double* partials = - ChainableStack::instance_->memalloc_.alloc_array( - num_vars_sliced_terms + num_vars_shared_terms); + vari** varis = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); + double* partials = ChainableStack::instance_->memalloc_.alloc_array( + num_vars_sliced_terms + num_vars_shared_terms); save_varis(varis, vmapped); save_varis(varis + num_vars_sliced_terms, args...); - for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) partials[i] = 0.0; + for (std::size_t i = 0; i < num_vars_sliced_terms; ++i) + partials[i] = 0.0; auto& pool = stan::math::TeamThreadPool::instance(); const std::size_t max_team = pool.team_size(); // Choose workers. (Caller participates, so total participants = n) - std::size_t n = std::min(max_team, num_terms == 0 ? 1 : num_terms); - if (n < 1) n = 1; + std::size_t n + = std::min(max_team, num_terms == 0 ? 1 : num_terms); + if (n < 1) + n = 1; // Chunking: default to ~2 chunks per participant (lower overhead). std::size_t gs; if (grainsize > 0) { gs = static_cast(grainsize); - if (gs < 1) gs = 1; + if (gs < 1) + gs = 1; } else { const std::size_t target_chunks = n * 2; gs = (num_terms + target_chunks - 1) / target_chunks; - if (gs < 1) gs = 1; + if (gs < 1) + gs = 1; } // Serial cutoff: if too few terms, don't parallelize. @@ -201,29 +208,34 @@ struct reduce_sum_impl, ReturnType, Ve } } - if (msgs && r.msgs_) *msgs << r.msgs_->str(); + if (msgs && r.msgs_) + *msgs << r.msgs_->str(); return var(new precomputed_gradients_vari( - r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); + r.sum_, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); } // One reducer per participant (0..n-1) for static partitioning. - // NOTE: we avoid copying vmapped/args by taking references/pointers inside reducer. + // NOTE: we avoid copying vmapped/args by taking references/pointers inside + // reducer. 
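+    // For illustration, with num_terms = 10 and n = 3 the block starts below
+    // are (10 * 0) / 3 = 0, (10 * 1) / 3 = 3 and (10 * 2) / 3 = 6, so the
+    // participants own the contiguous slices [0, 3), [3, 6) and [6, 10) and
+    // every term is visited exactly once.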
std::vector> workers; workers.reserve(n); for (std::size_t tid = 0; tid < n; ++tid) { workers.emplace_back(std::make_unique( - num_vars_per_term, num_vars_shared_terms, partials, - vmapped, msgs, args...)); + num_vars_per_term, num_vars_shared_terms, partials, vmapped, msgs, + args...)); } /* - std::cout << "--------------------------------------------------------------------------------" << std::endl - << "worker count = " << pool.worker_count() << std::endl - << "team size = " << pool.team_size() << std::endl - << "gs = " << gs << std::endl - << std::endl << std::endl; + std::cout << + "--------------------------------------------------------------------------------" + << std::endl + << "worker count = " << pool.worker_count() << std::endl + << "team size = " << pool.team_size() << std::endl + << "gs = " << gs << std::endl + << std::endl << std::endl; */ - + // Static partition: each participant gets a contiguous block once pool.parallel_region(n, [&](std::size_t tid) { const std::size_t b0 = (num_terms * tid) / n; @@ -255,10 +267,12 @@ struct reduce_sum_impl, ReturnType, Ve partials[num_vars_sliced_terms + i] = shared_adj[i]; } - if (msgs) *msgs << all_msgs.str(); + if (msgs) + *msgs << all_msgs.str(); return var(new precomputed_gradients_vari( - total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, partials)); + total_sum, num_vars_sliced_terms + num_vars_shared_terms, varis, + partials)); } }; From 9d4a4aec2417086085f333c535eac29fdc2b69d0 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 21:38:58 -0500 Subject: [PATCH 14/17] updating concurrent_vector --- stan/math/opencl/concurrent_vector.hpp | 290 +++++++++++++++++-------- stan/math/opencl/kernel_cl.hpp | 8 +- stan/math/opencl/opencl_context.hpp | 7 +- 3 files changed, 206 insertions(+), 99 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 46b3e448f19..095643daf61 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -2,58 +2,77 @@ #define STAN_MATH_OPENCL_CONCURRENT_VECTOR_HPP #include +#include +#include #include #include +#include #include +#include #include #include -#include -#include -#include namespace stan { namespace math { namespace internal { /** - * Minimal segmented concurrent_vector. - * - * Key properties: - * - concurrent emplace_back/push_back using an atomic size counter. - * - segmented storage => no moving elements during growth, stable addresses. - * - segments allocated lazily; allocation uses CAS to avoid locks. + * Segmented concurrent_vector. * - * Important constraints / notes: - * - operator[] is safe if you only read indices < size() that are known to be - * constructed. - * - For "publish-then-read" correctness: size() is updated before construction - * finishes. So consumers must not read index i just because i < size(); they - * must have a stronger protocol (e.g., producer hands out index, or you add a - * "constructed" bitmap). This matches common usage where only the pushing - * thread uses the returned index. - * - clear()/destruction are NOT concurrent with pushes. 
+ * Properties: + * - concurrent emplace_back/push_back via atomic size counter + * - segmented storage => no moving elements during growth, stable addresses + * - segments allocated lazily; allocation uses CAS to avoid locks + * - movable (required for return-by-value usage such as vec_concat) * - * If you need "readers can iterate up to size() safely while writers push", - * add a constructed flag per element (see comment near emplace_back()). + * Notes: + * - Intended for append-then-read patterns. + * - size_ increments before construction finishes. If readers iterate up to size() + * concurrently with writers, you need a constructed/published protocol. + * - clear()/destruction are NOT concurrent with pushes/reads. */ template class concurrent_vector { static_assert(BaseSegmentSize > 0, "BaseSegmentSize must be > 0"); static_assert((BaseSegmentSize & (BaseSegmentSize - 1)) == 0, - "BaseSegmentSize must be a power of two (helps mapping)."); + "BaseSegmentSize must be a power of two"); + static_assert(MaxSegments > 0, "MaxSegments must be > 0"); public: - concurrent_vector() : size_(0) { - segments_.resize(MaxSegments); - for (auto& p : segments_) - p.store(nullptr, std::memory_order_relaxed); + concurrent_vector() noexcept : size_(0) { + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); } - concurrent_vector(const concurrent_vector&) = delete; - concurrent_vector& operator=(const concurrent_vector&) = delete; + concurrent_vector(const concurrent_vector& other) : concurrent_vector() { + copy_from_(other); + } - ~concurrent_vector() { destroy_all_(); } + concurrent_vector& operator=(const concurrent_vector& other) { + if (this != &other) { + clear(); + copy_from_(other); + } + return *this; + } + + // Movable (needed so Stan can return-by-value) + concurrent_vector(concurrent_vector&& other) noexcept : size_(0) { + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + move_from_(other); + } + + concurrent_vector& operator=(concurrent_vector&& other) noexcept { + if (this != &other) { + destroy_all_(); + for (auto& p : segments_) p.store(nullptr, std::memory_order_relaxed); + size_.store(0, std::memory_order_relaxed); + move_from_(other); + } + return *this; + } + + ~concurrent_vector() noexcept { destroy_all_(); } std::size_t size() const noexcept { return size_.load(std::memory_order_acquire); @@ -61,40 +80,39 @@ class concurrent_vector { bool empty() const noexcept { return size() == 0; } - // Non-concurrent: safe only when no other threads are pushing/reading. + // Not concurrent with pushes/reads. void clear() { destroy_all_(); size_.store(0, std::memory_order_release); } - // Concurrent append (construct in place). Returns the index. + // Pre-allocate enough segments to back indices [0, capacity-1]. + // Safe to call concurrently with emplace_back (may race allocating segments; losers free). + void reserve(std::size_t capacity) { + if (capacity == 0) return; + const std::size_t last = capacity - 1; + const std::size_t last_seg = segment_index_(last); + if (last_seg >= MaxSegments) { + throw std::length_error("concurrent_vector::reserve: exceeds MaxSegments"); + } + for (std::size_t s = 0; s <= last_seg; ++s) { + ensure_segment_(s); + } + } + template std::size_t emplace_back(Args&&... 
args) { - // Claim an index const std::size_t idx = size_.fetch_add(1, std::memory_order_acq_rel); - - // Ensure the segment exists T* seg = ensure_segment_for_index_(idx); - - // Placement-new into the correct slot - const std::size_t off = offset_in_segment_(idx); - T* slot = seg + off; - - // Construct element + T* slot = seg + offset_in_segment_(idx); ::new (static_cast(slot)) T(std::forward(args)...); - - // If you need "safe iteration by other threads that use size()", - // you must publish construction completion separately, e.g.: - // constructed_[idx].store(true, release); - // and readers check constructed_[i].load(acquire). return idx; } std::size_t push_back(const T& v) { return emplace_back(v); } std::size_t push_back(T&& v) { return emplace_back(std::move(v)); } - // Returns pointer to element at i (no bounds check). - // Safe if element i is fully constructed and lifetime is valid. + // Pointer helper (no bounds check). T* data_at(std::size_t i) noexcept { T* seg = segment_ptr_(segment_index_(i)); return seg + offset_in_segment_(i); @@ -104,30 +122,99 @@ class concurrent_vector { return seg + offset_in_segment_(i); } - // Bounds-checked access (still not concurrent-safe unless you have a - // protocol). + // Bounds-checked access. T& at(std::size_t i) { - if (i >= size()) - throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } const T& at(std::size_t i) const { - if (i >= size()) - throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } - // Unchecked access + // Unchecked access. T& operator[](std::size_t i) noexcept { return *data_at(i); } const T& operator[](std::size_t i) const noexcept { return *data_at(i); } - // Capacity is segmented and unbounded until MaxSegments is exceeded. - // This is the max number of elements representable by the segment scheme. - static constexpr std::size_t max_size() noexcept { - // Total capacity = Base * (2^MaxSegments - 1) - // but beware overflow for large MaxSegments. 
- return BaseSegmentSize * ((std::size_t{1} << MaxSegments) - 1); + // ------------------------- + // Iterators (InputIterator is enough for std::vector(first,last)) + // ------------------------- + + class iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using reference = T&; + + iterator() : v_(nullptr), i_(0) {} + iterator(concurrent_vector* v, std::size_t i) : v_(v), i_(i) {} + + reference operator*() const { return (*v_)[i_]; } + pointer operator->() const { return &(*v_)[i_]; } + + iterator& operator++() { ++i_; return *this; } + iterator operator++(int) { iterator tmp = *this; ++(*this); return tmp; } + + friend bool operator==(const iterator& a, const iterator& b) { + return a.v_ == b.v_ && a.i_ == b.i_; + } + friend bool operator!=(const iterator& a, const iterator& b) { return !(a == b); } + + private: + concurrent_vector* v_; + std::size_t i_; + }; + + class const_iterator { + public: + using iterator_category = std::input_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = const T*; + using reference = const T&; + + const_iterator() : v_(nullptr), i_(0) {} + const_iterator(const concurrent_vector* v, std::size_t i) : v_(v), i_(i) {} + + reference operator*() const { return (*v_)[i_]; } + pointer operator->() const { return &(*v_)[i_]; } + + const_iterator& operator++() { ++i_; return *this; } + const_iterator operator++(int) { const_iterator tmp = *this; ++(*this); return tmp; } + + friend bool operator==(const const_iterator& a, const const_iterator& b) { + return a.v_ == b.v_ && a.i_ == b.i_; + } + friend bool operator!=(const const_iterator& a, const const_iterator& b) { return !(a == b); } + + private: + const concurrent_vector* v_; + std::size_t i_; + }; + + iterator begin() noexcept { return iterator(this, 0); } + iterator end() noexcept { return iterator(this, size()); } // snapshot at call time + + const_iterator begin() const noexcept { return const_iterator(this, 0); } + const_iterator end() const noexcept { return const_iterator(this, size()); } + + const_iterator cbegin() const noexcept { return const_iterator(this, 0); } + const_iterator cend() const noexcept { return const_iterator(this, size()); } + + T& back() { + const std::size_t n = size(); + if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + return (*this)[n - 1]; + } + + const T& back() const { + const std::size_t n = size(); + if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + return (*this)[n - 1]; } + // ------------------------- private: // Segment k has size BaseSegmentSize * 2^k @@ -135,28 +222,29 @@ class concurrent_vector { return BaseSegmentSize << k; } - // Prefix count before segment k: - // Base * (2^k - 1) + // Prefix elements before segment k: Base * (2^k - 1) static constexpr std::size_t segment_prefix_(std::size_t k) noexcept { return BaseSegmentSize * ((std::size_t{1} << k) - 1); } - // Map global index -> segment index. + // Map global index -> segment index // Let q = idx / Base. Then segment = floor(log2(q + 1)). 
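+  // For illustration, take Base = 8: indices 0..7 land in segment 0 (8
+  // slots), 8..23 in segment 1 (16 slots, offset = idx - 8), 24..55 in
+  // segment 2 (32 slots, offset = idx - 24), and so on, since segment k
+  // starts at global index Base * (2^k - 1).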
static std::size_t segment_index_(std::size_t idx) noexcept { const std::size_t q = idx / BaseSegmentSize; const std::size_t x = q + 1; #if defined(__GNUG__) || defined(__clang__) - // floor(log2(x)) via clz - return (sizeof(std::size_t) * 8 - 1) - - static_cast(__builtin_clzl(x)); + if constexpr (sizeof(std::size_t) == 8) { + return 63u - static_cast( + __builtin_clzll(static_cast(x))); + } else { + return 31u - static_cast( + __builtin_clzl(static_cast(x))); + } #else - // portable fallback std::size_t s = 0; std::size_t t = x; - while (t >>= 1) - ++s; + while (t >>= 1) ++s; return s; #endif } @@ -173,60 +261,76 @@ class concurrent_vector { return static_cast(segments_[s].load(std::memory_order_acquire)); } - T* ensure_segment_for_index_(std::size_t idx) { - const std::size_t s = segment_index_(idx); - if (s >= MaxSegments) { - throw std::length_error("concurrent_vector: exceeded MaxSegments"); - } - + T* ensure_segment_(std::size_t s) { T* seg = segment_ptr_(s); - if (seg) - return seg; + if (seg) return seg; - // Allocate segment lazily (raw storage for T objects) const std::size_t n = segment_size_(s); void* raw = ::operator new(sizeof(T) * n); T* fresh = static_cast(raw); - // CAS install; if another thread won, free ours. void* expected = nullptr; if (!segments_[s].compare_exchange_strong(expected, fresh, std::memory_order_release, std::memory_order_acquire)) { ::operator delete(raw); - seg = static_cast(segments_[s].load(std::memory_order_acquire)); + seg = segment_ptr_(s); assert(seg != nullptr); return seg; } - return fresh; } - // Destroy constructed elements and free segments. - // Not concurrent with pushes or reads. + T* ensure_segment_for_index_(std::size_t idx) { + const std::size_t s = segment_index_(idx); + if (s >= MaxSegments) { + throw std::length_error("concurrent_vector: exceeded MaxSegments"); + } + return ensure_segment_(s); + } + void destroy_all_() noexcept { const std::size_t n = size_.load(std::memory_order_acquire); - // Destroy elements that were constructed. - // NOTE: This assumes indices [0, n) are all constructed. - // If you allow exceptions or partial construction, track constructed flags. + // Assumes [0, n) constructed. for (std::size_t i = 0; i < n; ++i) { data_at(i)->~T(); } - // Free segments - for (std::size_t s = 0; s < segments_.size(); ++s) { - void* p = segments_[s].load(std::memory_order_acquire); - if (p) { - ::operator delete(p); - segments_[s].store(nullptr, std::memory_order_relaxed); - } + for (auto& a : segments_) { + void* p = a.exchange(nullptr, std::memory_order_acq_rel); + if (p) ::operator delete(p); + } + } + + void move_from_(concurrent_vector& other) noexcept { + // Steal size + const std::size_t n = other.size_.exchange(0, std::memory_order_acq_rel); + size_.store(n, std::memory_order_release); + + // Steal segments + for (std::size_t s = 0; s < MaxSegments; ++s) { + void* p = other.segments_[s].exchange(nullptr, std::memory_order_acq_rel); + segments_[s].store(p, std::memory_order_release); } } + void copy_from_(const concurrent_vector& other) { + const std::size_t n = other.size(); + if (n == 0) return; + + reserve(n); + // Important: we want size_ to match, but we must construct elements. + // Use emplace_back so construction happens in this container. 
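+    // (Copying, like clear(), is not a concurrent operation: `other` must not
+    // be receiving pushes while it is read here.)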
+ for (std::size_t i = 0; i < n; ++i) { + emplace_back(other[i]); + } + } + std::atomic size_; - std::vector> segments_; + std::array, MaxSegments> segments_; }; + } // namespace internal } // namespace math } // namespace stan diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index 8905243d709..cd97bda09c0 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. */ template * = nullptr> -inline internal::concurrent_vector select_events(const T& m) { - return internal::concurrent_vector{}; +inline stan::math::internal::concurrent_vector select_events(const T& m) { + return stan::math::internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const internal::concurrent_vector& select_events(const K& m) { +inline const stan::math::internal::concurrent_vector& select_events(const K& m) { return m.write_events(); } template * = nullptr, require_any_same_t* = nullptr> -inline internal::concurrent_vector select_events(K& m) { +inline stan::math::internal::concurrent_vector select_events(K& m) { static_assert(!std::is_const::value, "Can not write to const matrix_cl!"); return m.read_write_events(); } diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index b0c435e33e0..6f83296e39b 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -417,8 +417,11 @@ class opencl_context { * @param instance_id if of the device */ inline void select_device(int platform_id, int instance_id) { - for (cl::Kernel* cache : kernel_caches_) { - *cache = cl::Kernel(); + for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { + cl::Kernel*& cache = kernel_caches_[i]; + if (cache) { + *cache = cl::Kernel(); + } } kernel_caches_.clear(); opencl_context_base::select_device(platform_id, instance_id); From 1b7c545755a3850de76980f3dc87795e057e4b3b Mon Sep 17 00:00:00 2001 From: Stan Jenkins Date: Tue, 23 Dec 2025 21:39:58 -0500 Subject: [PATCH 15/17] [Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1 --- stan/math/opencl/concurrent_vector.hpp | 94 ++++++++++++++++++-------- stan/math/opencl/kernel_cl.hpp | 6 +- stan/math/opencl/opencl_context.hpp | 2 +- 3 files changed, 70 insertions(+), 32 deletions(-) diff --git a/stan/math/opencl/concurrent_vector.hpp b/stan/math/opencl/concurrent_vector.hpp index 095643daf61..1dcadd49152 100644 --- a/stan/math/opencl/concurrent_vector.hpp +++ b/stan/math/opencl/concurrent_vector.hpp @@ -27,8 +27,8 @@ namespace internal { * * Notes: * - Intended for append-then-read patterns. - * - size_ increments before construction finishes. If readers iterate up to size() - * concurrently with writers, you need a constructed/published protocol. + * - size_ increments before construction finishes. If readers iterate up to + * size() concurrently with writers, you need a constructed/published protocol. * - clear()/destruction are NOT concurrent with pushes/reads. */ template = MaxSegments) { - throw std::length_error("concurrent_vector::reserve: exceeds MaxSegments"); + throw std::length_error( + "concurrent_vector::reserve: exceeds MaxSegments"); } for (std::size_t s = 0; s <= last_seg; ++s) { ensure_segment_(s); @@ -124,11 +130,13 @@ class concurrent_vector { // Bounds-checked access. 
T& at(std::size_t i) { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } const T& at(std::size_t i) const { - if (i >= size()) throw std::out_of_range("concurrent_vector::at"); + if (i >= size()) + throw std::out_of_range("concurrent_vector::at"); return *data_at(i); } @@ -154,13 +162,22 @@ class concurrent_vector { reference operator*() const { return (*v_)[i_]; } pointer operator->() const { return &(*v_)[i_]; } - iterator& operator++() { ++i_; return *this; } - iterator operator++(int) { iterator tmp = *this; ++(*this); return tmp; } + iterator& operator++() { + ++i_; + return *this; + } + iterator operator++(int) { + iterator tmp = *this; + ++(*this); + return tmp; + } friend bool operator==(const iterator& a, const iterator& b) { return a.v_ == b.v_ && a.i_ == b.i_; } - friend bool operator!=(const iterator& a, const iterator& b) { return !(a == b); } + friend bool operator!=(const iterator& a, const iterator& b) { + return !(a == b); + } private: concurrent_vector* v_; @@ -181,13 +198,22 @@ class concurrent_vector { reference operator*() const { return (*v_)[i_]; } pointer operator->() const { return &(*v_)[i_]; } - const_iterator& operator++() { ++i_; return *this; } - const_iterator operator++(int) { const_iterator tmp = *this; ++(*this); return tmp; } + const_iterator& operator++() { + ++i_; + return *this; + } + const_iterator operator++(int) { + const_iterator tmp = *this; + ++(*this); + return tmp; + } friend bool operator==(const const_iterator& a, const const_iterator& b) { return a.v_ == b.v_ && a.i_ == b.i_; } - friend bool operator!=(const const_iterator& a, const const_iterator& b) { return !(a == b); } + friend bool operator!=(const const_iterator& a, const const_iterator& b) { + return !(a == b); + } private: const concurrent_vector* v_; @@ -195,7 +221,9 @@ class concurrent_vector { }; iterator begin() noexcept { return iterator(this, 0); } - iterator end() noexcept { return iterator(this, size()); } // snapshot at call time + iterator end() noexcept { + return iterator(this, size()); + } // snapshot at call time const_iterator begin() const noexcept { return const_iterator(this, 0); } const_iterator end() const noexcept { return const_iterator(this, size()); } @@ -205,13 +233,15 @@ class concurrent_vector { T& back() { const std::size_t n = size(); - if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + if (n == 0) + throw std::out_of_range("concurrent_vector::back on empty"); return (*this)[n - 1]; } const T& back() const { const std::size_t n = size(); - if (n == 0) throw std::out_of_range("concurrent_vector::back on empty"); + if (n == 0) + throw std::out_of_range("concurrent_vector::back on empty"); return (*this)[n - 1]; } // ------------------------- @@ -235,16 +265,19 @@ class concurrent_vector { #if defined(__GNUG__) || defined(__clang__) if constexpr (sizeof(std::size_t) == 8) { - return 63u - static_cast( - __builtin_clzll(static_cast(x))); + return 63u + - static_cast( + __builtin_clzll(static_cast(x))); } else { - return 31u - static_cast( - __builtin_clzl(static_cast(x))); + return 31u + - static_cast( + __builtin_clzl(static_cast(x))); } #else std::size_t s = 0; std::size_t t = x; - while (t >>= 1) ++s; + while (t >>= 1) + ++s; return s; #endif } @@ -263,7 +296,8 @@ class concurrent_vector { T* ensure_segment_(std::size_t s) { T* seg = segment_ptr_(s); - if (seg) return seg; + if (seg) + return seg; const std::size_t n = 
segment_size_(s); void* raw = ::operator new(sizeof(T) * n); @@ -299,7 +333,8 @@ class concurrent_vector { for (auto& a : segments_) { void* p = a.exchange(nullptr, std::memory_order_acq_rel); - if (p) ::operator delete(p); + if (p) + ::operator delete(p); } } @@ -317,7 +352,8 @@ class concurrent_vector { void copy_from_(const concurrent_vector& other) { const std::size_t n = other.size(); - if (n == 0) return; + if (n == 0) + return; reserve(n); // Important: we want size_ to match, but we must construct elements. @@ -326,7 +362,7 @@ class concurrent_vector { emplace_back(other[i]); } } - + std::atomic size_; std::array, MaxSegments> segments_; }; diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index cd97bda09c0..3dda705d331 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,12 +109,14 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. */ template * = nullptr> -inline stan::math::internal::concurrent_vector select_events(const T& m) { +inline stan::math::internal::concurrent_vector select_events( + const T& m) { return stan::math::internal::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const stan::math::internal::concurrent_vector& select_events(const K& m) { +inline const stan::math::internal::concurrent_vector& select_events( + const K& m) { return m.write_events(); } template * = nullptr, diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index 6f83296e39b..53e65a05c1d 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -420,7 +420,7 @@ class opencl_context { for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { cl::Kernel*& cache = kernel_caches_[i]; if (cache) { - *cache = cl::Kernel(); + *cache = cl::Kernel(); } } kernel_caches_.clear(); From 00c81b86d3065919de136904568a1530ae842596 Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Tue, 23 Dec 2025 21:40:50 -0500 Subject: [PATCH 16/17] update --- stan/math/opencl/opencl_context.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index 53e65a05c1d..b0c435e33e0 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -417,11 +417,8 @@ class opencl_context { * @param instance_id if of the device */ inline void select_device(int platform_id, int instance_id) { - for (std::size_t i = 0; i < kernel_caches_.size(); ++i) { - cl::Kernel*& cache = kernel_caches_[i]; - if (cache) { - *cache = cl::Kernel(); - } + for (cl::Kernel* cache : kernel_caches_) { + *cache = cl::Kernel(); } kernel_caches_.clear(); opencl_context_base::select_device(platform_id, instance_id); From e95bf022434185c59927ab374e796857a59d8a1c Mon Sep 17 00:00:00 2001 From: Daniel Lee Date: Wed, 24 Dec 2025 16:43:06 -0500 Subject: [PATCH 17/17] cpplint --- stan/math/rev/core/team_thread_pool.hpp | 4 ++-- stan/math/rev/functor/map_rect_concurrent.hpp | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/stan/math/rev/core/team_thread_pool.hpp b/stan/math/rev/core/team_thread_pool.hpp index 2a77ff644b9..90db011cb7e 100644 --- a/stan/math/rev/core/team_thread_pool.hpp +++ b/stan/math/rev/core/team_thread_pool.hpp @@ -146,10 +146,10 @@ class TeamThreadPool { if (!s || !*s) return 0; char* end = nullptr; - long v = std::strtol(s, &end, 10); + std::size_t v = static_cast(std::strtol(s, &end, 10)); if 
(end == s || v <= 0)
       return 0;
-    return static_cast<std::size_t>(v);
+    return v;
   }
 
   static std::size_t configured_cap_(std::size_t hw) noexcept {
diff --git a/stan/math/rev/functor/map_rect_concurrent.hpp b/stan/math/rev/functor/map_rect_concurrent.hpp
index 76d38866885..fe2dee2a380 100644
--- a/stan/math/rev/functor/map_rect_concurrent.hpp
+++ b/stan/math/rev/functor/map_rect_concurrent.hpp
@@ -9,9 +9,6 @@
 #include 
 #include 
 
-//#include 
-//#include 
-
 #include 
 #include 
 #include
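A minimal sketch of how a caller drives the TeamThreadPool introduced above, mirroring the pattern used in map_rect_concurrent.hpp and reduce_sum.hpp: the caller participates as tid 0 and each tid owns one contiguous block. The sum_blocks() function and its names are illustrative only; instance(), team_size() and parallel_region() are the members added in team_thread_pool.hpp.

#include <stan/math/rev/core/team_thread_pool.hpp>

#include <algorithm>
#include <cstddef>
#include <vector>

inline double sum_blocks(const std::vector<double>& x) {
  auto& pool = stan::math::TeamThreadPool::instance();
  // Team size counts the calling thread; never ask for more blocks than terms.
  const std::size_t n = std::min<std::size_t>(pool.team_size(), x.size());
  if (n == 0)
    return 0.0;
  std::vector<double> partial(n, 0.0);
  pool.parallel_region(n, [&](std::size_t tid) {
    // Static partition: tid owns the contiguous block [b0, b1).
    const std::size_t b0 = (x.size() * tid) / n;
    const std::size_t b1 = (x.size() * (tid + 1)) / n;
    for (std::size_t i = b0; i < b1; ++i)
      partial[tid] += x[i];  // blocks are disjoint, so no synchronization
  });
  double total = 0.0;
  for (double p : partial)
    total += p;
  return total;
}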