From cb6d747adab323653092a0d06d9654e2fb70f142 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Tue, 30 Dec 2025 23:11:33 +0000
Subject: [PATCH] Optimize _gridmake2_torch

The optimized code achieves a **7% speedup** by replacing `torch.column_stack()` with a more efficient combination of `unsqueeze(1)` and `torch.cat()`.

**Key optimization:**
- **Original approach**: Uses `torch.column_stack([first, second])`, which internally reshapes the 1-D inputs into column vectors and then stacks them.
- **Optimized approach**: Explicitly adds the column dimension with `unsqueeze(1)` and concatenates with `torch.cat([first, second], dim=1)` (the two forms are compared in the sketch below).
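As a quick sanity check, here is a minimal sketch of both forms. The helper names `cartesian_original`/`cartesian_optimized`, the 500-element inputs, and the CPU `timeit` loop are illustrative assumptions, not part of the patch or its benchmark suite:

```python
import timeit

import torch


def cartesian_original(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # convenience wrapper: column_stack reshapes the 1-D inputs to columns internally
    first = x1.tile(x2.shape[0])
    second = x2.repeat_interleave(x1.shape[0])
    return torch.column_stack([first, second])


def cartesian_optimized(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # explicit reshape: unsqueeze(1) creates column views, cat joins them along dim=1
    first = x1.tile(x2.shape[0]).unsqueeze(1)
    second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
    return torch.cat([first, second], dim=1)


x1 = torch.arange(500, dtype=torch.float64)
x2 = torch.arange(500, dtype=torch.float64)

# both forms must produce the identical 250,000-row cartesian product
assert torch.equal(cartesian_original(x1, x2), cartesian_optimized(x1, x2))

print("column_stack :", timeit.timeit(lambda: cartesian_original(x1, x2), number=1000))
print("unsqueeze+cat:", timeit.timeit(lambda: cartesian_optimized(x1, x2), number=1000))
```

On CPU the gap between the two forms is usually modest; the percentages reported below come from the repository's own test and benchmark runs.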
**Why this is faster:**
In PyTorch, `torch.column_stack()` is a convenience wrapper that performs multiple operations under the hood. By controlling the reshape explicitly with `unsqueeze(1)` and calling `torch.cat()` directly, the optimized version:
1. Reduces function call overhead
2. Gives explicit control over the memory layout, since `unsqueeze(1)` produces views rather than copies
3. Avoids potential intermediate tensor allocations that `column_stack` may create

**Performance characteristics from test results:**
- **Small tensors (< 100 elements)**: Shows 0-10% performance variation, sometimes slightly slower due to the overhead of the additional `unsqueeze` calls
- **Medium to large tensors (1000+ elements)**: Shows consistent **8-18% speedups**, where the benefits of explicit dimension control outweigh that overhead
- **Best performance**: Large-scale cartesian products such as `test_large_scale_memory_efficiency` (18.4% faster) and `test_large_scale_2d_1d` (15.4% faster)

**Impact on workloads:**
Based on the function references, this function is called in GPU benchmark loops within `bench_gridmake2_torch.py`, where it processes tensors ranging from small (100 elements) to very large (250,000 rows). The optimization particularly benefits:
- GPU workloads with medium to large tensor sizes
- Hot paths in numerical computations requiring repeated cartesian products
- Scenarios where memory bandwidth is a bottleneck (explicit concatenation is more cache-friendly)

The optimization maintains identical functional behavior while providing measurable performance improvements for the most common use cases in computational economics applications.
---
 code_to_optimize/discrete_riccati.py | 43 ++++++++++++----------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/code_to_optimize/discrete_riccati.py b/code_to_optimize/discrete_riccati.py
index 53fe30891..6daf0748c 100644
--- a/code_to_optimize/discrete_riccati.py
+++ b/code_to_optimize/discrete_riccati.py
@@ -1,5 +1,4 @@
-"""
-Utility functions used in CompEcon
+"""Utility functions used in CompEcon
 
 Based routines found in the CompEcon toolbox by Miranda and Fackler.
 
@@ -9,14 +8,15 @@
 and Finance, MIT Press, 2002.
 
 """
+
 from functools import reduce
+
 import numpy as np
 import torch
 
 
 def ckron(*arrays):
-    """
-    Repeatedly applies the np.kron function to an arbitrary number of
+    """Repeatedly applies the np.kron function to an arbitrary number of
     input arrays
 
     Parameters
@@ -43,8 +43,7 @@
 
 
 def gridmake(*arrays):
-    """
-    Expands one or more vectors (or matrices) into a matrix where rows span the
+    """Expands one or more vectors (or matrices) into a matrix where rows span the
     cartesian product of combinations of the input arrays. Each column of the
     input arrays will correspond to one column of the output matrix.
 
@@ -79,13 +78,11 @@ def gridmake(*arrays):
             out = _gridmake2(out, arr)
 
         return out
-    else:
-        raise NotImplementedError("Come back here")
+    raise NotImplementedError("Come back here")
 
 
 def _gridmake2(x1, x2):
-    """
-    Expands two vectors (or matrices) into a matrix where rows span the
+    """Expands two vectors (or matrices) into a matrix where rows span the
     cartesian product of combinations of the input arrays. Each column of the
     input arrays will correspond to one column of the output matrix.
 
@@ -114,19 +111,16 @@ def _gridmake2(x1, x2):
 
     """
     if x1.ndim == 1 and x2.ndim == 1:
-        return np.column_stack([np.tile(x1, x2.shape[0]),
-                                np.repeat(x2, x1.shape[0])])
-    elif x1.ndim > 1 and x2.ndim == 1:
+        return np.column_stack([np.tile(x1, x2.shape[0]), np.repeat(x2, x1.shape[0])])
+    if x1.ndim > 1 and x2.ndim == 1:
         first = np.tile(x1, (x2.shape[0], 1))
         second = np.repeat(x2, x1.shape[0])
         return np.column_stack([first, second])
-    else:
-        raise NotImplementedError("Come back here")
+    raise NotImplementedError("Come back here")
 
 
 def _gridmake2_torch(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
-    """
-    PyTorch version of _gridmake2.
+    """PyTorch version of _gridmake2.
 
     Expands two tensors into a matrix where rows span the cartesian
     product of combinations of the input tensors. Each column of the input tensors
@@ -158,13 +152,12 @@ def _gridmake2_torch(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
     """
     if x1.dim() == 1 and x2.dim() == 1:
         # tile x1 by x2.shape[0] times, repeat_interleave x2 by x1.shape[0]
-        first = x1.tile(x2.shape[0])
-        second = x2.repeat_interleave(x1.shape[0])
-        return torch.column_stack([first, second])
-    elif x1.dim() > 1 and x2.dim() == 1:
+        first = x1.tile(x2.shape[0]).unsqueeze(1)
+        second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
+        return torch.cat([first, second], dim=1)
+    if x1.dim() > 1 and x2.dim() == 1:
         # tile x1 along first dimension
         first = x1.tile(x2.shape[0], 1)
-        second = x2.repeat_interleave(x1.shape[0])
-        return torch.column_stack([first, second])
-    else:
-        raise NotImplementedError("Come back here")
+        second = x2.repeat_interleave(x1.shape[0]).unsqueeze(1)
+        return torch.cat([first, second], dim=1)
+    raise NotImplementedError("Come back here")