diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 23a4b79e02022..5ceeee57b0be4 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -275,6 +275,11 @@ harness = false
 name = "ends_with"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "regexp_count"
+required-features = ["regex_expressions"]
+
 [[bench]]
 harness = false
 name = "crypto"
diff --git a/datafusion/functions/benches/regexp_count.rs b/datafusion/functions/benches/regexp_count.rs
new file mode 100644
index 0000000000000..eae7ef00f16bd
--- /dev/null
+++ b/datafusion/functions/benches/regexp_count.rs
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::Int64Array;
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::regex;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(
+    size: usize,
+    str_len: usize,
+    with_start: bool,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+    // Use a simple pattern that matches common characters
+    let pattern = ColumnarValue::Scalar(ScalarValue::Utf8(Some("a".to_string())));
+
+    if with_start {
+        // Test with start position (this is where the optimization matters)
+        let start_array = Arc::new(Int64Array::from(
+            (0..size).map(|i| (i % 10 + 1) as i64).collect::<Vec<_>>(),
+        ));
+        vec![
+            ColumnarValue::Array(string_array),
+            pattern,
+            ColumnarValue::Array(start_array),
+        ]
+    } else {
+        vec![ColumnarValue::Array(string_array), pattern]
+    }
+}
+
+fn invoke_regexp_count_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    regex::regexp_count().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int64, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("regexp_count size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        // Test without start position (no optimization impact)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, false);
+            group.bench_function(
+                format!("regexp_count_no_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        // Test with start position (optimization should help here)
+        for str_len in [32, 128] {
+            let args = create_args::<i32>(size, str_len, true);
+            group.bench_function(
+                format!("regexp_count_with_start [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone();
+                        black_box(invoke_regexp_count_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs
index de4c657dd0d20..7792d8098a4ef 100644
--- a/datafusion/functions/src/regex/regexpcount.rs
+++ b/datafusion/functions/src/regex/regexpcount.rs
@@ -569,8 +569,16 @@ fn count_matches(
             ));
         }
 
-        let find_slice = value.chars().skip(start as usize - 1).collect::<String>();
-        let count = pattern.find_iter(find_slice.as_str()).count();
+        // Find the byte offset for the start position (1-based character index)
+        let byte_offset = value
+            .char_indices()
+            .nth((start as usize).saturating_sub(1))
+            .map(|(idx, _)| idx)
+            .unwrap_or(value.len());
+
+        // Use string slicing instead of collecting chars into a new String
+        let find_slice = &value[byte_offset..];
+        let count = pattern.find_iter(find_slice).count();
         Ok(count as i64)
     } else {
         let count = pattern.find_iter(value).count();