From 90780739bf78de3d92fd10c49c9245eb93e7af18 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 27 Dec 2025 10:06:02 -0700 Subject: [PATCH 1/3] perf: Improve performance of normalize_nan --- native/spark-expr/Cargo.toml | 8 + native/spark-expr/benches/comparison.rs | 337 ++++++++++++++++++ native/spark-expr/benches/normalize_nan.rs | 88 +++++ .../src/math_funcs/internal/normalize_nan.rs | 80 ++--- 4 files changed, 455 insertions(+), 58 deletions(-) create mode 100644 native/spark-expr/benches/comparison.rs create mode 100644 native/spark-expr/benches/normalize_nan.rs diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml index ea89c43204..634b3eedbf 100644 --- a/native/spark-expr/Cargo.toml +++ b/native/spark-expr/Cargo.toml @@ -80,6 +80,14 @@ harness = false name = "padding" harness = false +[[bench]] +name = "comparison" +harness = false + +[[bench]] +name = "normalize_nan" +harness = false + [[test]] name = "test_udf_registration" path = "tests/spark_expr_reg.rs" diff --git a/native/spark-expr/benches/comparison.rs b/native/spark-expr/benches/comparison.rs new file mode 100644 index 0000000000..b7b934d787 --- /dev/null +++ b/native/spark-expr/benches/comparison.rs @@ -0,0 +1,337 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for comparison expressions (eq, eq_null_safe, lt, is_null, etc.) + +use arrow::array::builder::{Int64Builder, StringBuilder}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatch; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use datafusion::logical_expr::Operator; +use datafusion::physical_expr::expressions::{BinaryExpr, Column, IsNotNullExpr, IsNullExpr}; +use datafusion::physical_expr::PhysicalExpr; +use std::hint::black_box; +use std::sync::Arc; + +const BATCH_SIZE: usize = 8192; + +fn make_col(name: &str, index: usize) -> Arc { + Arc::new(Column::new(name, index)) +} + +/// Create a batch with two int64 columns with a specified null percentage +fn create_int64_batch(null_pct: usize) -> RecordBatch { + let mut c1 = Int64Builder::with_capacity(BATCH_SIZE); + let mut c2 = Int64Builder::with_capacity(BATCH_SIZE); + + for i in 0..BATCH_SIZE { + if null_pct > 0 && i % (100 / null_pct) == 0 { + c1.append_null(); + } else { + c1.append_value(i as i64); + } + if null_pct > 0 && (i + 1) % (100 / null_pct) == 0 { + c2.append_null(); + } else { + c2.append_value((i as i64) % 1000); + } + } + + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int64, null_pct > 0), + Field::new("c2", DataType::Int64, null_pct > 0), + ]); + + RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(c1.finish()), Arc::new(c2.finish())], + ) + .unwrap() +} + +/// Create a batch with two string columns with a specified null percentage +fn create_string_batch(null_pct: usize) -> RecordBatch { + let mut c1 = StringBuilder::with_capacity(BATCH_SIZE, BATCH_SIZE * 20); + let mut c2 = StringBuilder::with_capacity(BATCH_SIZE, BATCH_SIZE * 20); + + for i in 0..BATCH_SIZE { + if null_pct > 0 && i % (100 / null_pct) == 0 { + c1.append_null(); + } else { + c1.append_value(format!("string value {}", i)); + } 
+ if null_pct > 0 && (i + 1) % (100 / null_pct) == 0 { + c2.append_null(); + } else { + c2.append_value(format!("string value {}", i % 1000)); + } + } + + let schema = Schema::new(vec![ + Field::new("c1", DataType::Utf8, null_pct > 0), + Field::new("c2", DataType::Utf8, null_pct > 0), + ]); + + RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(c1.finish()), Arc::new(c2.finish())], + ) + .unwrap() +} + +fn bench_int64_equality(c: &mut Criterion) { + let mut group = c.benchmark_group("int64_equality"); + + for null_pct in [0, 10, 50] { + let batch = create_int64_batch(null_pct); + + // Regular equality: c1 = c2 + let eq_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::Eq, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("eq", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(eq_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + + // Null-safe equality: c1 <=> c2 (IsNotDistinctFrom) + let eq_null_safe_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::IsNotDistinctFrom, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("eq_null_safe", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(eq_null_safe_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +fn bench_int64_less_than(c: &mut Criterion) { + let mut group = c.benchmark_group("int64_less_than"); + + for null_pct in [0, 10, 50] { + let batch = create_int64_batch(null_pct); + + // Less than: c1 < c2 + let lt_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::Lt, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("lt", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(lt_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + + // Less than or equal: c1 <= c2 + let lteq_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::LtEq, + make_col("c2", 1), + )); 
+ + group.bench_with_input( + BenchmarkId::new("lt_eq", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(lteq_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +fn bench_string_equality(c: &mut Criterion) { + let mut group = c.benchmark_group("string_equality"); + + for null_pct in [0, 10, 50] { + let batch = create_string_batch(null_pct); + + // Regular equality: c1 = c2 + let eq_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::Eq, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("eq", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(eq_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + + // Null-safe equality: c1 <=> c2 (IsNotDistinctFrom) + let eq_null_safe_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::IsNotDistinctFrom, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("eq_null_safe", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(eq_null_safe_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +fn bench_string_less_than(c: &mut Criterion) { + let mut group = c.benchmark_group("string_less_than"); + + for null_pct in [0, 10, 50] { + let batch = create_string_batch(null_pct); + + // Less than: c1 < c2 + let lt_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::Lt, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("lt", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(lt_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +fn bench_is_null(c: &mut Criterion) { + let mut group = c.benchmark_group("is_null"); + + for null_pct in [0, 10, 50] { + let batch = create_int64_batch(null_pct); + + // IS NULL check + let is_null_expr = Arc::new(IsNullExpr::new(make_col("c1", 0))); + + group.bench_with_input( + 
BenchmarkId::new("int64_is_null", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(is_null_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + + // IS NOT NULL check + let is_not_null_expr = Arc::new(IsNotNullExpr::new(make_col("c1", 0))); + + group.bench_with_input( + BenchmarkId::new("int64_is_not_null", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(is_not_null_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + // Also benchmark string is_null + for null_pct in [0, 10, 50] { + let batch = create_string_batch(null_pct); + + let is_null_expr = Arc::new(IsNullExpr::new(make_col("c1", 0))); + + group.bench_with_input( + BenchmarkId::new("string_is_null", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(is_null_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +fn bench_not_equal(c: &mut Criterion) { + let mut group = c.benchmark_group("not_equal"); + + for null_pct in [0, 10, 50] { + let batch = create_int64_batch(null_pct); + + // Not equal: c1 != c2 + let neq_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::NotEq, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("int64_neq", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(neq_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + + // Null-safe not equal: c1 IS DISTINCT FROM c2 + let neq_null_safe_expr = Arc::new(BinaryExpr::new( + make_col("c1", 0), + Operator::IsDistinctFrom, + make_col("c2", 1), + )); + + group.bench_with_input( + BenchmarkId::new("int64_neq_null_safe", format!("null_{}pct", null_pct)), + &batch, + |b, batch| { + b.iter(|| black_box(neq_null_safe_expr.evaluate(black_box(batch)).unwrap())); + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_int64_equality, + bench_int64_less_than, + bench_string_equality, + bench_string_less_than, + bench_is_null, + 
bench_not_equal, +); +criterion_main!(benches); diff --git a/native/spark-expr/benches/normalize_nan.rs b/native/spark-expr/benches/normalize_nan.rs new file mode 100644 index 0000000000..17413e7f07 --- /dev/null +++ b/native/spark-expr/benches/normalize_nan.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Benchmarks for NormalizeNaNAndZero expression + +use arrow::array::Float64Array; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatch; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::PhysicalExpr; +use datafusion_comet_spark_expr::NormalizeNaNAndZero; +use std::hint::black_box; +use std::sync::Arc; + +const BATCH_SIZE: usize = 8192; + +fn make_col(name: &str, index: usize) -> Arc<dyn PhysicalExpr> { + Arc::new(Column::new(name, index)) +} + +/// Create a batch with float64 column containing various values including NaN and -0.0 +fn create_float_batch(nan_pct: usize, neg_zero_pct: usize, null_pct: usize) -> RecordBatch { + let mut values: Vec<Option<f64>> = Vec::with_capacity(BATCH_SIZE); + + for i in 0..BATCH_SIZE { + if null_pct > 0 && i % (100 / null_pct.max(1)) == 0 { + values.push(None); + } else if nan_pct > 0 && i % (100 / nan_pct.max(1)) == 1 { + values.push(Some(f64::NAN)); + } else if neg_zero_pct > 0 && i % (100 / neg_zero_pct.max(1)) == 2 { + values.push(Some(-0.0)); + } else { + values.push(Some(i as f64 * 1.5)); + } + } + + let array = Float64Array::from(values); + let schema = Schema::new(vec![Field::new("c1", DataType::Float64, true)]); + + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap() +} + +fn bench_normalize_nan_and_zero(c: &mut Criterion) { + let mut group = c.benchmark_group("normalize_nan_and_zero"); + + // Test with different percentages of special values + let test_cases = [ + ("no_special", 0, 0, 0), + ("10pct_nan", 10, 0, 0), + ("10pct_neg_zero", 0, 10, 0), + ("10pct_null", 0, 0, 10), + ("mixed_10pct", 5, 5, 5), + ("all_normal", 0, 0, 0), + ]; + + for (name, nan_pct, neg_zero_pct, null_pct) in test_cases { + let batch = create_float_batch(nan_pct, neg_zero_pct, null_pct); + + let normalize_expr = Arc::new(NormalizeNaNAndZero::new( + DataType::Float64, + make_col("c1", 0), + )); + +
group.bench_with_input(BenchmarkId::new("float64", name), &batch, |b, batch| { + b.iter(|| black_box(normalize_expr.evaluate(black_box(batch)).unwrap())); + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_normalize_nan_and_zero); +criterion_main!(benches); diff --git a/native/spark-expr/src/math_funcs/internal/normalize_nan.rs b/native/spark-expr/src/math_funcs/internal/normalize_nan.rs index 0bd556ed73..4094bd7621 100644 --- a/native/spark-expr/src/math_funcs/internal/normalize_nan.rs +++ b/native/spark-expr/src/math_funcs/internal/normalize_nan.rs @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. +use arrow::compute::unary; use arrow::datatypes::{DataType, Schema}; use arrow::{ - array::{as_primitive_array, ArrayAccessor, ArrayIter, Float32Array, Float64Array}, - datatypes::{ArrowNativeType, Float32Type, Float64Type}, + array::{as_primitive_array, Float32Array, Float64Array}, + datatypes::{Float32Type, Float64Type}, record_batch::RecordBatch, }; use datafusion::logical_expr::ColumnarValue; @@ -78,14 +79,16 @@ impl PhysicalExpr for NormalizeNaNAndZero { match &self.data_type { DataType::Float32 => { - let v = eval_typed(as_primitive_array::<Float32Type>(&array)); - let new_array = Float32Array::from(v); - Ok(ColumnarValue::Array(Arc::new(new_array))) + let input = as_primitive_array::<Float32Type>(&array); + // Use unary which operates directly on values buffer without intermediate allocation + let result: Float32Array = unary(input, normalize_float); + Ok(ColumnarValue::Array(Arc::new(result))) } DataType::Float64 => { - let v = eval_typed(as_primitive_array::<Float64Type>(&array)); - let new_array = Float64Array::from(v); - Ok(ColumnarValue::Array(Arc::new(new_array))) + let input = as_primitive_array::<Float64Type>(&array); + // Use unary which operates directly on values buffer without intermediate allocation + let result: Float64Array = unary(input, normalize_float); + Ok(ColumnarValue::Array(Arc::new(result))) } dt => panic!("Unexpected data
type {dt:?}"), } @@ -106,20 +109,17 @@ impl PhysicalExpr for NormalizeNaNAndZero { } } -fn eval_typed<V: FloatDouble, T: ArrayAccessor<Item = V>>(input: T) -> Vec<Option<V>> { - let iter = ArrayIter::new(input); - iter.map(|o| { - o.map(|v| { - if v.is_nan() { - v.nan() - } else if v.is_neg_zero() { - v.zero() - } else { - v - } - }) - }) - .collect() +/// Normalize a floating point value by converting all NaN representations to a canonical NaN +/// and negative zero to positive zero. This is used for Spark's comparison semantics. +#[inline] +fn normalize_float<T: num::Float>(v: T) -> T { + if v.is_nan() { + T::nan() + } else if v == T::neg_zero() { + T::zero() + } else { + v + } } impl Display for NormalizeNaNAndZero { @@ -127,39 +127,3 @@ impl Display for NormalizeNaNAndZero { write!(f, "FloatNormalize [child: {}]", self.child) } } - -trait FloatDouble: ArrowNativeType { - fn is_nan(&self) -> bool; - fn nan(&self) -> Self; - fn is_neg_zero(&self) -> bool; - fn zero(&self) -> Self; -} - -impl FloatDouble for f32 { - fn is_nan(&self) -> bool { - f32::is_nan(*self) - } - fn nan(&self) -> Self { - f32::NAN - } - fn is_neg_zero(&self) -> bool { - *self == -0.0 - } - fn zero(&self) -> Self { - 0.0 - } -} -impl FloatDouble for f64 { - fn is_nan(&self) -> bool { - f64::is_nan(*self) - } - fn nan(&self) -> Self { - f64::NAN - } - fn is_neg_zero(&self) -> bool { - *self == -0.0 - } - fn zero(&self) -> Self { - 0.0 - } -} From 611940b923fa50df71d6e867cf1635f850bd80a7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 27 Dec 2025 10:07:00 -0700 Subject: [PATCH 2/3] revert --- native/spark-expr/benches/comparison.rs | 337 ------------------------ 1 file changed, 337 deletions(-) delete mode 100644 native/spark-expr/benches/comparison.rs diff --git a/native/spark-expr/benches/comparison.rs b/native/spark-expr/benches/comparison.rs deleted file mode 100644 index b7b934d787..0000000000 --- a/native/spark-expr/benches/comparison.rs +++ /dev/null @@ -1,337 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more
contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Benchmarks for comparison expressions (eq, eq_null_safe, lt, is_null, etc.) - -use arrow::array::builder::{Int64Builder, StringBuilder}; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::record_batch::RecordBatch; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; -use datafusion::logical_expr::Operator; -use datafusion::physical_expr::expressions::{BinaryExpr, Column, IsNotNullExpr, IsNullExpr}; -use datafusion::physical_expr::PhysicalExpr; -use std::hint::black_box; -use std::sync::Arc; - -const BATCH_SIZE: usize = 8192; - -fn make_col(name: &str, index: usize) -> Arc { - Arc::new(Column::new(name, index)) -} - -/// Create a batch with two int64 columns with a specified null percentage -fn create_int64_batch(null_pct: usize) -> RecordBatch { - let mut c1 = Int64Builder::with_capacity(BATCH_SIZE); - let mut c2 = Int64Builder::with_capacity(BATCH_SIZE); - - for i in 0..BATCH_SIZE { - if null_pct > 0 && i % (100 / null_pct) == 0 { - c1.append_null(); - } else { - c1.append_value(i as i64); - } - if null_pct > 0 && (i + 1) % (100 / null_pct) == 0 { - c2.append_null(); - } else { - c2.append_value((i as i64) % 1000); - } - } - - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int64, null_pct 
> 0), - Field::new("c2", DataType::Int64, null_pct > 0), - ]); - - RecordBatch::try_new( - Arc::new(schema), - vec![Arc::new(c1.finish()), Arc::new(c2.finish())], - ) - .unwrap() -} - -/// Create a batch with two string columns with a specified null percentage -fn create_string_batch(null_pct: usize) -> RecordBatch { - let mut c1 = StringBuilder::with_capacity(BATCH_SIZE, BATCH_SIZE * 20); - let mut c2 = StringBuilder::with_capacity(BATCH_SIZE, BATCH_SIZE * 20); - - for i in 0..BATCH_SIZE { - if null_pct > 0 && i % (100 / null_pct) == 0 { - c1.append_null(); - } else { - c1.append_value(format!("string value {}", i)); - } - if null_pct > 0 && (i + 1) % (100 / null_pct) == 0 { - c2.append_null(); - } else { - c2.append_value(format!("string value {}", i % 1000)); - } - } - - let schema = Schema::new(vec![ - Field::new("c1", DataType::Utf8, null_pct > 0), - Field::new("c2", DataType::Utf8, null_pct > 0), - ]); - - RecordBatch::try_new( - Arc::new(schema), - vec![Arc::new(c1.finish()), Arc::new(c2.finish())], - ) - .unwrap() -} - -fn bench_int64_equality(c: &mut Criterion) { - let mut group = c.benchmark_group("int64_equality"); - - for null_pct in [0, 10, 50] { - let batch = create_int64_batch(null_pct); - - // Regular equality: c1 = c2 - let eq_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::Eq, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("eq", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(eq_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - - // Null-safe equality: c1 <=> c2 (IsNotDistinctFrom) - let eq_null_safe_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::IsNotDistinctFrom, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("eq_null_safe", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(eq_null_safe_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -fn 
bench_int64_less_than(c: &mut Criterion) { - let mut group = c.benchmark_group("int64_less_than"); - - for null_pct in [0, 10, 50] { - let batch = create_int64_batch(null_pct); - - // Less than: c1 < c2 - let lt_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::Lt, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("lt", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(lt_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - - // Less than or equal: c1 <= c2 - let lteq_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::LtEq, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("lt_eq", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(lteq_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -fn bench_string_equality(c: &mut Criterion) { - let mut group = c.benchmark_group("string_equality"); - - for null_pct in [0, 10, 50] { - let batch = create_string_batch(null_pct); - - // Regular equality: c1 = c2 - let eq_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::Eq, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("eq", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(eq_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - - // Null-safe equality: c1 <=> c2 (IsNotDistinctFrom) - let eq_null_safe_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::IsNotDistinctFrom, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("eq_null_safe", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(eq_null_safe_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -fn bench_string_less_than(c: &mut Criterion) { - let mut group = c.benchmark_group("string_less_than"); - - for null_pct in [0, 10, 50] { - let batch = create_string_batch(null_pct); - - 
// Less than: c1 < c2 - let lt_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::Lt, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("lt", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(lt_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -fn bench_is_null(c: &mut Criterion) { - let mut group = c.benchmark_group("is_null"); - - for null_pct in [0, 10, 50] { - let batch = create_int64_batch(null_pct); - - // IS NULL check - let is_null_expr = Arc::new(IsNullExpr::new(make_col("c1", 0))); - - group.bench_with_input( - BenchmarkId::new("int64_is_null", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(is_null_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - - // IS NOT NULL check - let is_not_null_expr = Arc::new(IsNotNullExpr::new(make_col("c1", 0))); - - group.bench_with_input( - BenchmarkId::new("int64_is_not_null", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(is_not_null_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - // Also benchmark string is_null - for null_pct in [0, 10, 50] { - let batch = create_string_batch(null_pct); - - let is_null_expr = Arc::new(IsNullExpr::new(make_col("c1", 0))); - - group.bench_with_input( - BenchmarkId::new("string_is_null", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(is_null_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -fn bench_not_equal(c: &mut Criterion) { - let mut group = c.benchmark_group("not_equal"); - - for null_pct in [0, 10, 50] { - let batch = create_int64_batch(null_pct); - - // Not equal: c1 != c2 - let neq_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::NotEq, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("int64_neq", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| 
black_box(neq_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - - // Null-safe not equal: c1 IS DISTINCT FROM c2 - let neq_null_safe_expr = Arc::new(BinaryExpr::new( - make_col("c1", 0), - Operator::IsDistinctFrom, - make_col("c2", 1), - )); - - group.bench_with_input( - BenchmarkId::new("int64_neq_null_safe", format!("null_{}pct", null_pct)), - &batch, - |b, batch| { - b.iter(|| black_box(neq_null_safe_expr.evaluate(black_box(batch)).unwrap())); - }, - ); - } - - group.finish(); -} - -criterion_group!( - benches, - bench_int64_equality, - bench_int64_less_than, - bench_string_equality, - bench_string_less_than, - bench_is_null, - bench_not_equal, -); -criterion_main!(benches); From d774b1a7b97cc3bff0a99a79b02817db9d170913 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 27 Dec 2025 10:07:50 -0700 Subject: [PATCH 3/3] revert --- native/spark-expr/Cargo.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml index 634b3eedbf..b056e1b29a 100644 --- a/native/spark-expr/Cargo.toml +++ b/native/spark-expr/Cargo.toml @@ -80,10 +80,6 @@ harness = false name = "padding" harness = false -[[bench]] -name = "comparison" -harness = false - [[bench]] name = "normalize_nan" harness = false