Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ required-features = ["unicode_expressions"]

[[bench]]
harness = false
name = "ltrim"
name = "trim"
required-features = ["string_expressions"]

[[bench]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,28 +48,58 @@ impl fmt::Display for StringArrayType {
}
}

/// returns an array of strings, and `characters` as a ScalarValue
pub fn create_string_array_and_characters(
#[derive(Clone, Copy)]
pub enum TrimType {
Ltrim,
Rtrim,
Btrim,
}

impl fmt::Display for TrimType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TrimType::Ltrim => f.write_str("ltrim"),
TrimType::Rtrim => f.write_str("rtrim"),
TrimType::Btrim => f.write_str("btrim"),
}
}
}

/// Returns an array of strings with trim characters positioned according to trim type,
/// and `characters` as a ScalarValue.
///
/// For ltrim: trim characters are at the start (prefix)
/// For rtrim: trim characters are at the end (suffix)
/// For btrim: trim characters are at both start and end
fn create_string_array_and_characters(
size: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
string_array_type: StringArrayType,
trim_type: TrimType,
) -> (ArrayRef, ScalarValue) {
let rng = &mut StdRng::seed_from_u64(42);

// Create `size` rows:
// - 10% rows will be `None`
// - Other 90% will be strings with same `remaining_len` lengths
// We will build the string array on it later.
// - Other 90% will be strings with `remaining_len` content length
let string_iter = (0..size).map(|_| {
if rng.random::<f32>() < 0.1 {
None
} else {
let mut value = trimmed.as_bytes().to_vec();
let generated = rng.sample_iter(&Alphanumeric).take(remaining_len);
value.extend(generated);
Some(String::from_utf8(value).unwrap())
let content: String = rng
.sample_iter(&Alphanumeric)
.take(remaining_len)
.map(char::from)
.collect();

let value = match trim_type {
TrimType::Ltrim => format!("{trimmed}{content}"),
TrimType::Rtrim => format!("{content}{trimmed}"),
TrimType::Btrim => format!("{trimmed}{content}{trimmed}"),
};
Comment on lines +97 to +101
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
let value = match trim_type {
TrimType::Ltrim => format!("{trimmed}{content}"),
TrimType::Rtrim => format!("{content}{trimmed}"),
TrimType::Btrim => format!("{trimmed}{content}{trimmed}"),
};
let value = {
let (prefix, suffix) = match trim_type {
TrimType::Ltrim => (trimmed, ""),
TrimType::Rtrim => ("", trimmed),
TrimType::Btrim => (trimmed, trimmed),
};
let mut s = String::with_capacity(
prefix.len() + content.len() + suffix.len()
);
s.push_str(prefix);
s.push_str(content);
s.push_str(suffix);
s
};

Slightly more verbose but less string allocations on concats

Some(value)
}
});

Expand All @@ -90,30 +120,22 @@ pub fn create_string_array_and_characters(
}
}

/// Create args for the ltrim benchmark
/// Inputs:
/// - size: rows num of the test array
/// - characters: the characters we need to trim
/// - trimmed: the part in the testing string that will be trimmed
/// - remaining_len: the len of the remaining part of testing string after trimming
/// - string_array_type: the method used to store the testing strings
///
/// Outputs:
/// - testing string array
/// - trimmed characters
/// Create args for the trim benchmark
fn create_args(
size: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
string_array_type: StringArrayType,
trim_type: TrimType,
) -> Vec<ColumnarValue> {
let (string_array, pattern) = create_string_array_and_characters(
size,
characters,
trimmed,
remaining_len,
string_array_type,
trim_type,
);
vec![
ColumnarValue::Array(string_array),
Expand All @@ -124,15 +146,23 @@ fn create_args(
#[allow(clippy::too_many_arguments)]
fn run_with_string_type<M: Measurement>(
group: &mut BenchmarkGroup<'_, M>,
ltrim: &ScalarUDF,
trim_func: &ScalarUDF,
trim_type: TrimType,
size: usize,
len: usize,
total_len: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
string_type: StringArrayType,
) {
let args = create_args(size, characters, trimmed, remaining_len, string_type);
let args = create_args(
size,
characters,
trimmed,
remaining_len,
string_type,
trim_type,
);
let arg_fields = args
.iter()
.enumerate()
Expand All @@ -142,12 +172,12 @@ fn run_with_string_type<M: Measurement>(

group.bench_function(
format!(
"{string_type} [size={size}, len_before={len}, len_after={remaining_len}]",
"{trim_type} {string_type} [size={size}, len={total_len}, remaining={remaining_len}]",
),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(ltrim.invoke_with_args(ScalarFunctionArgs {
black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
Expand All @@ -160,13 +190,14 @@ fn run_with_string_type<M: Measurement>(
}

#[allow(clippy::too_many_arguments)]
fn run_one_group(
fn run_trim_benchmark(
c: &mut Criterion,
group_name: &str,
ltrim: &ScalarUDF,
trim_func: &ScalarUDF,
trim_type: TrimType,
string_types: &[StringArrayType],
size: usize,
len: usize,
total_len: usize,
characters: &str,
trimmed: &str,
remaining_len: usize,
Expand All @@ -178,9 +209,10 @@ fn run_one_group(
for string_type in string_types {
run_with_string_type(
&mut group,
ltrim,
trim_func,
trim_type,
size,
len,
total_len,
characters,
trimmed,
remaining_len,
Expand All @@ -193,61 +225,79 @@ fn run_one_group(

fn criterion_benchmark(c: &mut Criterion) {
let ltrim = string::ltrim();
let rtrim = string::rtrim();
let btrim = string::btrim();

let characters = ",!()";

let string_types = [
StringArrayType::Utf8View,
StringArrayType::Utf8,
StringArrayType::LargeUtf8,
];
for size in [1024, 4096, 8192] {
// len=12, trimmed_len=4, len_after_ltrim=8
let len = 12;
let trimmed = characters;
let remaining_len = len - trimmed.len();
run_one_group(
c,
"INPUT LEN <= 12",
&ltrim,
&string_types,
size,
len,
characters,
trimmed,
remaining_len,
);

// len=64, trimmed_len=4, len_after_ltrim=60
let len = 64;
let trimmed = characters;
let remaining_len = len - trimmed.len();
run_one_group(
c,
"INPUT LEN > 12, OUTPUT LEN > 12",
&ltrim,
&string_types,
size,
len,
characters,
trimmed,
remaining_len,
);
let trim_funcs = [
(&ltrim, TrimType::Ltrim),
(&rtrim, TrimType::Rtrim),
(&btrim, TrimType::Btrim),
];

// len=64, trimmed_len=56, len_after_ltrim=8
let len = 64;
let trimmed = characters.repeat(15);
let remaining_len = len - trimmed.len();
run_one_group(
c,
"INPUT LEN > 12, OUTPUT LEN <= 12",
&ltrim,
&string_types,
size,
len,
characters,
&trimmed,
remaining_len,
);
for size in [4096] {
for (trim_func, trim_type) in &trim_funcs {
// Scenario 1: Short strings (len <= 12, inline in StringView)
// trimmed_len=4, remaining_len=8
let total_len = 12;
let trimmed = characters;
let remaining_len = total_len - trimmed.len();
run_trim_benchmark(
c,
"short strings (len <= 12)",
trim_func,
*trim_type,
&string_types,
size,
total_len,
characters,
trimmed,
remaining_len,
);

// Scenario 2: Long strings, short trim (len > 12, output > 12)
// trimmed_len=4, remaining_len=60
let total_len = 64;
let trimmed = characters;
let remaining_len = total_len - trimmed.len();
run_trim_benchmark(
c,
"long strings, short trim",
trim_func,
*trim_type,
&string_types,
size,
total_len,
characters,
trimmed,
remaining_len,
);

// Scenario 3: Long strings, long trim (len > 12, output <= 12)
// trimmed_len=56, remaining_len=8
let total_len = 64;
let trimmed = characters.repeat(14);
let remaining_len = total_len - trimmed.len();
run_trim_benchmark(
c,
"long strings, long trim",
trim_func,
*trim_type,
&string_types,
size,
total_len,
characters,
&trimmed,
remaining_len,
);
}
}
}

Expand Down
Loading