From dbdc19c305844084492621e8936b9e10c60f0692 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Mon, 29 Dec 2025 10:23:01 -0800
Subject: [PATCH] perf: optimize regexp_count to avoid String allocation when start position is provided
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace `.chars().skip().collect::<String>()` with zero-copy string slicing
using `char_indices()` to find the byte offset, then slice with
`&value[byte_offset..]`. This eliminates unnecessary String allocation
per row when a start position is specified.

Changes:
- Use char_indices().nth() to find byte offset for start position (1-based)
- Use string slicing &value[byte_offset..] instead of collecting chars
- Added benchmark to measure performance improvements

Optimization:
- Before: Allocated new String via .collect() for each row with start position
- After: Uses zero-copy string slice

Benchmark results:
- size=1024, str_len=32: 96.361 µs -> 41.458 µs (57.0% faster, 2.3x speedup)
- size=1024, str_len=128: 210.16 µs -> 56.064 µs (73.3% faster, 3.7x speedup)
- size=4096, str_len=32: 376.90 µs -> 162.98 µs (56.8% faster, 2.3x speedup)
- size=4096, str_len=128: 855.68 µs -> 263.61 µs (69.2% faster, 3.2x speedup)

The optimization shows greater improvements for longer strings (up to 73%
faster) since string slicing is O(1) regardless of length, while the
previous approach had allocation costs that grew with string length.
---
 datafusion/functions/Cargo.toml               |   5 +
 datafusion/functions/benches/regexp_count.rs  | 118 ++++++++++++++++++
 datafusion/functions/src/regex/regexpcount.rs |  12 +-
 3 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 datafusion/functions/benches/regexp_count.rs

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index d85a269c7fa71..524ad105858a3 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -270,6 +270,11 @@ harness = false
 name = "ends_with"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "regexp_count"
+required-features = ["regex_expressions"]
+
 [[bench]]
 harness = false
 name = "translate"
diff --git a/datafusion/functions/benches/regexp_count.rs b/datafusion/functions/benches/regexp_count.rs
new file mode 100644
index 0000000000000..eae7ef00f16bd
--- /dev/null
+++ b/datafusion/functions/benches/regexp_count.rs
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::Int64Array;
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::regex;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    with_start: bool,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    // Use a simple pattern that matches common characters
+    let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some("a".to_string())));
+
+    if with_start {
+        // Test with start position (this is where the optimization matters)
+        let start_array = Arc::new(Int64Array::from(
+            (0..size).map(|i| (i % 10 + 1) as i64).collect::<Vec<_>>(),
+        ));
+        vec![
+            ColumnarValue::Array(string_array),
+            pattern,
+            ColumnarValue::Array(start_array),
+        ]
+    } else {
+        vec![ColumnarValue::Array(string_array), pattern]
+    }
+}
+
+fn invoke_regexp_count_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    regex::regexp_count().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int64, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("regexp_count size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Test without start position (no optimization impact)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, false);
+            group.bench_function(
+                format!("regexp_count_no_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        // Test with start position (optimization should help here)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, true);
+            group.bench_function(
+                format!("regexp_count_with_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs
index de4c657dd0d20..7792d8098a4ef 100644
--- a/datafusion/functions/src/regex/regexpcount.rs
+++ b/datafusion/functions/src/regex/regexpcount.rs
@@ -569,8 +569,16 @@ fn count_matches(
             ));
         }
 
-        let find_slice = value.chars().skip(start as usize - 1).collect::<String>();
-        let count = pattern.find_iter(find_slice.as_str()).count();
+        // Find the byte offset for the start position (1-based character index)
+        let byte_offset = value
+            .char_indices()
+            .nth((start as usize).saturating_sub(1))
+            .map(|(idx, _)| idx)
+            .unwrap_or(value.len());
+
+        // Use string slicing instead of collecting chars into a new String
+        let find_slice = &value[byte_offset..];
+        let count = pattern.find_iter(find_slice).count();
         Ok(count as i64)
     } else {
         let count = pattern.find_iter(value).count();