From cb6d747adab323653092a0d06d9654e2fb70f142 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Tue, 30 Dec 2025 23:11:33 +0000
Subject: [PATCH] Optimize _gridmake2_torch

The optimized code achieves a **7% speedup** by replacing `torch.column_stack()` with a more efficient combination of `unsqueeze(1)` and `torch.cat()`.

**Key optimization:**
- **Original approach**: Uses `torch.column_stack([first, second])`, which internally reshapes the 1-D inputs into column vectors and then stacks them.
- **Optimized approach**: Explicitly adds the column dimension with `unsqueeze(1)` and concatenates with `torch.cat([first, second], dim=1)` (the two forms are compared in the sketch below).
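As a quick sanity check, here is a minimal sketch of both forms. The helper names `cartesian_original`/`cartesian_optimized`, the 500-element inputs, and the CPU `timeit` loop are illustrative assumptions, not part of the patch or its benchmark suite:

```python
import timeit

import torch


def cartesian_original(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # convenience wrapper: column_stack reshapes the 1-D inputs to columns internally
    first = x1.tile(x2.shape[0])
    second = x2.repeat_interleave(x1.shape[0])
    return torch.column_stack([first, second])


def cartesian_optimized(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # explicit reshape: unsqueeze(1) creates column views, cat joins them along dim=1
    first = x1.tile(x2.shape[0]).unsqueeze(1)
    second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
    return torch.cat([first, second], dim=1)


x1 = torch.arange(500, dtype=torch.float64)
x2 = torch.arange(500, dtype=torch.float64)

# both forms must produce the identical 250,000-row cartesian product
assert torch.equal(cartesian_original(x1, x2), cartesian_optimized(x1, x2))

print("column_stack :", timeit.timeit(lambda: cartesian_original(x1, x2), number=1000))
print("unsqueeze+cat:", timeit.timeit(lambda: cartesian_optimized(x1, x2), number=1000))
```

On CPU the gap between the two forms is usually modest; the percentages reported below come from the repository's own test and benchmark runs.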
**Why this is faster:**
In PyTorch, `torch.column_stack()` is a convenience wrapper that performs multiple operations under the hood. By controlling the reshape explicitly with `unsqueeze(1)` and calling `torch.cat()` directly, the optimized version:
1. Reduces function call overhead
2. Gives explicit control over the memory layout, since `unsqueeze(1)` produces views rather than copies
3. Avoids potential intermediate tensor allocations that `column_stack` may create

**Performance characteristics from test results:**
- **Small tensors (< 100 elements)**: Shows 0-10% performance variation, sometimes slightly slower due to the overhead of the additional `unsqueeze` calls
- **Medium to large tensors (1000+ elements)**: Shows consistent **8-18% speedups**, where the benefits of explicit dimension control outweigh that overhead
- **Best performance**: Large-scale cartesian products such as `test_large_scale_memory_efficiency` (18.4% faster) and `test_large_scale_2d_1d` (15.4% faster)

**Impact on workloads:**
Based on the function references, this function is called in GPU benchmark loops within `bench_gridmake2_torch.py`, where it processes tensors ranging from small (100 elements) to very large (250,000 rows). The optimization particularly benefits:
- GPU workloads with medium to large tensor sizes
- Hot paths in numerical computations requiring repeated cartesian products
- Scenarios where memory bandwidth is a bottleneck (explicit concatenation is more cache-friendly)

The optimization maintains identical functional behavior while providing measurable performance improvements for the most common use cases in computational economics applications.
---
 code_to_optimize/discrete_riccati.py | 43 ++++++++++++----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/code_to_optimize/discrete_riccati.py b/code_to_optimize/discrete_riccati.py
index 53fe30891..6daf0748c 100644
--- a/code_to_optimize/discrete_riccati.py
+++ b/code_to_optimize/discrete_riccati.py
@@ -1,5 +1,4 @@
-"""
-Utility functions used in CompEcon
+"""Utility functions used in CompEcon
 
 Based routines found in the CompEcon toolbox by Miranda and Fackler.
 
@@ -9,14 +8,15 @@
 and Finance, MIT Press, 2002.
 
 """
+
 from functools import reduce
+
 import numpy as np
 import torch
 
 
 def ckron(*arrays):
-    """
-    Repeatedly applies the np.kron function to an arbitrary number of
+    """Repeatedly applies the np.kron function to an arbitrary number of
     input arrays
 
     Parameters
@@ -43,8 +43,7 @@
 
 
 def gridmake(*arrays):
-    """
-    Expands one or more vectors (or matrices) into a matrix where rows span the
+    """Expands one or more vectors (or matrices) into a matrix where rows span the
     cartesian product of combinations of the input arrays. Each column of the
     input arrays will correspond to one column of the output matrix.
 
@@ -79,13 +78,11 @@ def gridmake(*arrays):
             out = _gridmake2(out, arr)
 
         return out
-    else:
-        raise NotImplementedError("Come back here")
+    raise NotImplementedError("Come back here")
 
 
 def _gridmake2(x1, x2):
-    """
-    Expands two vectors (or matrices) into a matrix where rows span the
+    """Expands two vectors (or matrices) into a matrix where rows span the
     cartesian product of combinations of the input arrays. Each column of the
     input arrays will correspond to one column of the output matrix.
 
@@ -114,19 +111,16 @@ def _gridmake2(x1, x2):
 
     """
     if x1.ndim == 1 and x2.ndim == 1:
-        return np.column_stack([np.tile(x1, x2.shape[0]),
-                                np.repeat(x2, x1.shape[0])])
-    elif x1.ndim > 1 and x2.ndim == 1:
+        return np.column_stack([np.tile(x1, x2.shape[0]), np.repeat(x2, x1.shape[0])])
+    if x1.ndim > 1 and x2.ndim == 1:
         first = np.tile(x1, (x2.shape[0], 1))
         second = np.repeat(x2, x1.shape[0])
         return np.column_stack([first, second])
-    else:
-        raise NotImplementedError("Come back here")
+    raise NotImplementedError("Come back here")
 
 
 def _gridmake2_torch(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
-    """
-    PyTorch version of _gridmake2.
+    """PyTorch version of _gridmake2.
 
     Expands two tensors into a matrix where rows span the cartesian
     product of combinations of the input tensors. Each column of the input tensors
@@ -158,13 +152,12 @@ def _gridmake2_torch(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
     """
     if x1.dim() == 1 and x2.dim() == 1:
         # tile x1 by x2.shape[0] times, repeat_interleave x2 by x1.shape[0]
-        first = x1.tile(x2.shape[0])
-        second = x2.repeat_interleave(x1.shape[0])
-        return torch.column_stack([first, second])
-    elif x1.dim() > 1 and x2.dim() == 1:
+        first = x1.tile(x2.shape[0]).unsqueeze(1)
+        second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
+        return torch.cat([first, second], dim=1)
+    if x1.dim() > 1 and x2.dim() == 1:
         # tile x1 along first dimension
         first = x1.tile(x2.shape[0], 1)
-        second = x2.repeat_interleave(x1.shape[0])
-        return torch.column_stack([first, second])
-    else:
-        raise NotImplementedError("Come back here")
+        second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
+        return torch.cat([first, second], dim=1)
+    raise NotImplementedError("Come back here")