From f9fa158feb1fc21fbe85311f5b643028885f0c68 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Dec 2025 19:53:41 -0800 Subject: [PATCH 1/5] PERF: use PyArrow-native implementation for dt.total_seconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid conversion to TimedeltaArray by using PyArrow compute directly. Cast duration to int64, then to float64, and multiply by unit factor. ~3.7x speedup (3.53ms -> 0.96ms for 1M rows). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/arrays/arrow/array.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index edf1d7ddcaa76..ffc4b39427176 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2915,7 +2915,13 @@ def _dt_to_pytimedelta(self) -> np.ndarray: return np.array(data, dtype=object) def _dt_total_seconds(self) -> Self: - result = pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + # Convert duration to seconds using PyArrow compute + # Must cast to int64 first since duration -> float64 is not supported + unit = self._pa_array.type.unit + unit_to_seconds = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9} + factor = unit_to_seconds[unit] + int_arr = pc.cast(self._pa_array, pa.int64()) + result = pc.multiply(pc.cast(int_arr, pa.float64()), factor) return self._from_pyarrow_array(result) def _dt_as_unit(self, unit: str) -> Self: From f20b03de9f21d7105d294fd306a6272d5721cee0 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Dec 2025 19:59:49 -0800 Subject: [PATCH 2/5] CLN: combine nested casts in _dt_total_seconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ffc4b39427176..5695b99f31a93 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2920,8 +2920,8 @@ def _dt_total_seconds(self) -> Self: unit = self._pa_array.type.unit unit_to_seconds = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9} factor = unit_to_seconds[unit] - int_arr = pc.cast(self._pa_array, pa.int64()) - result = pc.multiply(pc.cast(int_arr, pa.float64()), factor) + float_arr = pc.cast(pc.cast(self._pa_array, pa.int64()), pa.float64()) + result = pc.multiply(float_arr, factor) return self._from_pyarrow_array(result) def _dt_as_unit(self, unit: str) -> Self: From 138cdf8789df21b927f526299e4572c87055b4f0 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 18 Dec 2025 20:02:49 -0800 Subject: [PATCH 3/5] CLN: remove unnecessary float64 cast in _dt_total_seconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PyArrow automatically promotes int64 to double when multiplying with float. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pandas/core/arrays/arrow/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5695b99f31a93..1d1fbc9ecb79b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2916,12 +2916,10 @@ def _dt_to_pytimedelta(self) -> np.ndarray: def _dt_total_seconds(self) -> Self: # Convert duration to seconds using PyArrow compute - # Must cast to int64 first since duration -> float64 is not supported unit = self._pa_array.type.unit unit_to_seconds = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9} factor = unit_to_seconds[unit] - float_arr = pc.cast(pc.cast(self._pa_array, pa.int64()), pa.float64()) - result = pc.multiply(float_arr, factor) + result = pc.multiply(pc.cast(self._pa_array, pa.int64()), factor) return self._from_pyarrow_array(result) def _dt_as_unit(self, unit: str) -> Self: From f7b829978e49b7409fdba0b8dfa977235151dd50 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 19 Dec 2025 10:40:18 -0800 Subject: [PATCH 4/5] use divide instead of multiply --- pandas/core/arrays/arrow/array.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 100912507cbea..92b84a696b521 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2917,11 +2917,9 @@ def _dt_to_pytimedelta(self) -> np.ndarray: return np.array(data, dtype=object) def _dt_total_seconds(self) -> Self: - # Convert duration to seconds using PyArrow compute unit = self._pa_array.type.unit - unit_to_seconds = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9} - factor = unit_to_seconds[unit] - result = pc.multiply(pc.cast(self._pa_array, pa.int64()), factor) + unit_per_second = {"s": 1, "ms": 1000, "us": 1_000_000, "ns": 1_000_000_000} + result = pc.divide(pc.cast(self._pa_array, pa.int64()), unit_per_second[unit]) return self._from_pyarrow_array(result) def _dt_as_unit(self, unit: str) -> Self: From ea827f7cc206f8eb0521802c1158ce0bd6bcf178 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 19 Dec 2025 11:14:22 -0800 Subject: [PATCH 5/5] change factor to float --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 92b84a696b521..9509fbc739151 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2918,7 +2918,7 @@ def _dt_to_pytimedelta(self) -> np.ndarray: def _dt_total_seconds(self) -> Self: unit = self._pa_array.type.unit - unit_per_second = {"s": 1, "ms": 1000, "us": 1_000_000, "ns": 1_000_000_000} + unit_per_second = {"s": 1.0, "ms": 1e3, "us": 1e6, "ns": 1e9} result = pc.divide(pc.cast(self._pa_array, pa.int64()), unit_per_second[unit]) return self._from_pyarrow_array(result)