From 4d5ba128a5234e24e522e6fdb5a4a283d28076c8 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 21 May 2025 01:15:00 +0800 Subject: [PATCH 1/7] GH-52: Make RangeEqualsVisitor of RunEndEncodedVector more efficient --- .../vector/compare/RangeEqualsVisitor.java | 28 ++++------- .../vector/complex/RunEndEncodedVector.java | 50 +++++++++++++++++++ 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index abcf312c5e..03559b667e 100644 --- a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -43,6 +43,7 @@ import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.NonNullableStructVector; import org.apache.arrow.vector.complex.RunEndEncodedVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector.RangeIterator; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -270,35 +271,24 @@ protected boolean compareRunEndEncodedVectors(Range range) { RunEndEncodedVector leftVector = (RunEndEncodedVector) left; RunEndEncodedVector rightVector = (RunEndEncodedVector) right; - final int leftRangeEnd = range.getLeftStart() + range.getLength(); - final int rightRangeEnd = range.getRightStart() + range.getLength(); + final RunEndEncodedVector.RangeIterator leftIterator = + new RangeIterator(leftVector, range.getLeftStart(), range.getLength()); + final RunEndEncodedVector.RangeIterator rightIterator = + new RangeIterator(rightVector, range.getRightStart(), range.getLength()); FieldVector leftValuesVector = leftVector.getValuesVector(); FieldVector rightValuesVector = rightVector.getValuesVector(); RangeEqualsVisitor innerVisitor = createInnerVisitor(leftValuesVector, rightValuesVector, null); - int leftLogicalIndex = range.getLeftStart(); - int rightLogicalIndex = range.getRightStart(); + while (leftIterator.nextRun() | rightIterator.nextRun()) { + int leftPhysicalIndex = leftIterator.getRunIndex(); + int rightPhysicalIndex = rightIterator.getRunIndex(); - while (leftLogicalIndex < leftRangeEnd) { - // TODO: implement it more efficient - // https://github.com/apache/arrow/issues/44157 - int leftPhysicalIndex = leftVector.getPhysicalIndex(leftLogicalIndex); - int rightPhysicalIndex = rightVector.getPhysicalIndex(rightLogicalIndex); if (leftValuesVector.accept( innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { - int leftRunEnd = leftVector.getRunEnd(leftLogicalIndex); - int rightRunEnd = rightVector.getRunEnd(rightLogicalIndex); - - int leftRunLength = Math.min(leftRunEnd, leftRangeEnd) - leftLogicalIndex; - int rightRunLength = Math.min(rightRunEnd, rightRangeEnd) - rightLogicalIndex; - - if (leftRunLength != rightRunLength) { + if (leftIterator.getRunLength() != rightIterator.getRunLength()) { return false; - } else { - leftLogicalIndex = leftRunEnd; - rightLogicalIndex = rightRunEnd; } } else { return false; diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java index 1bb9a3d6c0..c2231fbbd9 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -820,4 +820,54 @@ static int getPhysicalIndex(FieldVector runEndVector, int logicalIndex) { return result; } + + public static class RangeIterator { + + private final RunEndEncodedVector runEndEncodedVector; + private final int rangeEnd; + private int runIndex; + private int runEnd; + private int logicalPos; + + public RangeIterator(RunEndEncodedVector runEndEncodedVector, int startIndex, int length) { + this.runEndEncodedVector = runEndEncodedVector; + this.rangeEnd = startIndex + length; + this.runIndex = runEndEncodedVector.getPhysicalIndex(startIndex) - 1; + this.runEnd = startIndex; + this.logicalPos = -1; + } + + public boolean nextRun() { + logicalPos = runEnd; + if (logicalPos >= rangeEnd) { + return false; + } + updateRun(); + return true; + } + + private void updateRun() { + runIndex++; + runEnd = (int) ((BaseIntVector) runEndEncodedVector.runEndsVector).getValueAsLong(runIndex); + } + + public boolean nextValue() { + logicalPos++; + if (logicalPos >= rangeEnd) { + return false; + } + if (logicalPos == runEnd) { + updateRun(); + } + return true; + } + + public int getRunIndex() { + return runIndex; + } + + public int getRunLength() { + return Math.min(runEnd, rangeEnd) - logicalPos; + } + } } From 741ab758673b12c39536de17bdb77efc077c9480 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 21 May 2025 15:29:10 +0800 Subject: [PATCH 2/7] GH-52: Make RangeEqualsVisitor of RunEndEncodedVector more efficient --- .../vector/compare/RangeEqualsVisitor.java | 15 +++++++++++---- .../vector/complex/RunEndEncodedVector.java | 16 +++++++++++++++- .../arrow/vector/TestRunEndEncodedVector.java | 18 ++++++++++++------ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 03559b667e..ead5590784 100644 --- a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -272,16 +272,17 @@ protected boolean compareRunEndEncodedVectors(Range range) { RunEndEncodedVector rightVector = (RunEndEncodedVector) right; final RunEndEncodedVector.RangeIterator leftIterator = - new RangeIterator(leftVector, range.getLeftStart(), range.getLength()); + new RunEndEncodedVector.RangeIterator(leftVector, range.getLeftStart(), range.getLength()); final RunEndEncodedVector.RangeIterator rightIterator = - new RangeIterator(rightVector, range.getRightStart(), range.getLength()); + new RunEndEncodedVector.RangeIterator( + rightVector, range.getRightStart(), range.getLength()); FieldVector leftValuesVector = leftVector.getValuesVector(); FieldVector rightValuesVector = rightVector.getValuesVector(); RangeEqualsVisitor innerVisitor = createInnerVisitor(leftValuesVector, rightValuesVector, null); - while (leftIterator.nextRun() | rightIterator.nextRun()) { + while (nextRun(leftIterator, rightIterator)) { int leftPhysicalIndex = leftIterator.getRunIndex(); int rightPhysicalIndex = rightIterator.getRunIndex(); @@ -295,7 +296,13 @@ innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { } } - return true; + return leftIterator.isEnd() && rightIterator.isEnd(); + } + + private static boolean nextRun(RangeIterator leftIterator, RangeIterator rightIterator) { + boolean left = leftIterator.nextRun(); + boolean right = rightIterator.nextRun(); + return left && right; } protected RangeEqualsVisitor createInnerVisitor( diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java index c2231fbbd9..031632ede9 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -28,6 +28,7 @@ import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BigIntVector; @@ -830,8 +831,17 @@ public static class RangeIterator { private int logicalPos; public RangeIterator(RunEndEncodedVector runEndEncodedVector, int startIndex, int length) { + int rangeEnd = startIndex + length; + Preconditions.checkArgument( + startIndex >= 0, "startIndex %s must be non negative.", startIndex); + Preconditions.checkArgument( + rangeEnd <= runEndEncodedVector.getValueCount(), + "(startIndex + length) %s out of range[0, %s].", + rangeEnd, + runEndEncodedVector.getValueCount()); + + this.rangeEnd = rangeEnd; this.runEndEncodedVector = runEndEncodedVector; - this.rangeEnd = startIndex + length; this.runIndex = runEndEncodedVector.getPhysicalIndex(startIndex) - 1; this.runEnd = startIndex; this.logicalPos = -1; @@ -869,5 +879,9 @@ public int getRunIndex() { public int getRunLength() { return Math.min(runEnd, rangeEnd) - logicalPos; } + + public boolean isEnd() { + return logicalPos >= rangeEnd; + } } } diff --git a/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java b/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java index adf51c0730..9fa153e928 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java @@ -148,12 +148,18 @@ public void testRangeCompare() { assertTrue( constantVector.accept( new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 2, 13))); - assertFalse( - constantVector.accept( - new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 10, 10))); - assertFalse( - constantVector.accept( - new RangeEqualsVisitor(constantVector, constantVector), new Range(10, 1, 10))); + + // throws exception if the range end is out the bound of the vector + assertThrows( + IllegalArgumentException.class, + () -> + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(1, 10, 10))); + assertThrows( + IllegalArgumentException.class, + () -> + constantVector.accept( + new RangeEqualsVisitor(constantVector, constantVector), new Range(10, 1, 10))); // Create REE vector representing: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]. RunEndEncodedVector reeVector = From 767e5ab04722c4004821785df5cd4e1766506096 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 21 May 2025 22:36:42 +0800 Subject: [PATCH 3/7] check length first --- .../org/apache/arrow/vector/compare/RangeEqualsVisitor.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index ead5590784..78a48dc906 100644 --- a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -286,12 +286,8 @@ protected boolean compareRunEndEncodedVectors(Range range) { int leftPhysicalIndex = leftIterator.getRunIndex(); int rightPhysicalIndex = rightIterator.getRunIndex(); - if (leftValuesVector.accept( + if (leftIterator.getRunLength() != rightIterator.getRunLength() || !leftValuesVector.accept( innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { - if (leftIterator.getRunLength() != rightIterator.getRunLength()) { - return false; - } - } else { return false; } } From 304fe6f46a232b7edee663367446fe2f1f891d81 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 21 May 2025 23:34:26 +0800 Subject: [PATCH 4/7] format --- .../org/apache/arrow/vector/compare/RangeEqualsVisitor.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 78a48dc906..bc2e3a6aab 100644 --- a/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -286,8 +286,9 @@ protected boolean compareRunEndEncodedVectors(Range range) { int leftPhysicalIndex = leftIterator.getRunIndex(); int rightPhysicalIndex = rightIterator.getRunIndex(); - if (leftIterator.getRunLength() != rightIterator.getRunLength() || !leftValuesVector.accept( - innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { + if (leftIterator.getRunLength() != rightIterator.getRunLength() + || !leftValuesVector.accept( + innerVisitor, new Range(leftPhysicalIndex, rightPhysicalIndex, 1))) { return false; } } From 5266e4de5e69e4310f0830bc55a4b1d59deabbe5 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 21 May 2025 23:35:52 +0800 Subject: [PATCH 5/7] TestRangeEqualsVisitor --- .../compare/TestRangeEqualsVisitor.java | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index 08da786eb2..9624734356 100644 --- a/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -22,6 +22,7 @@ import java.nio.charset.Charset; import java.util.Arrays; +import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BigIntVector; @@ -39,6 +40,7 @@ import org.apache.arrow.vector.complex.LargeListViewVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.NullableStructWriter; @@ -53,7 +55,9 @@ import org.apache.arrow.vector.holders.NullableUInt4Holder; import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; import org.junit.jupiter.api.AfterEach; @@ -1003,6 +1007,54 @@ public void testLargeListViewVectorApproxEquals() { } } + @Test + public void testRunEndEncodedFloat8ApproxEquals() { + try (final Float8Vector vector1 = new Float8Vector("float", allocator); + final Float8Vector vector2 = new Float8Vector("float", allocator); + final Float8Vector vector3 = new Float8Vector("float", allocator); + final IntVector reeVector = new IntVector("ree", allocator)) { + + final float epsilon = 1.0E-6f; + setVector(vector1, 1.1, 2.2); + setVector(vector2, 1.1 + epsilon / 2, 2.2 + epsilon / 2); + setVector(vector3, 1.1 + epsilon * 2, 2.2 + epsilon * 2); + setVector(reeVector, 1, 3); + + ArrowType type = MinorType.FLOAT8.getType(); + final FieldType valueType = FieldType.notNullable(type); + final FieldType runEndType = FieldType.notNullable(MinorType.INT.getType()); + + final Field valueField = new Field("value", valueType, null); + final Field runEndField = new Field("ree", runEndType, null); + + Field field = + new Field( + "ree_float", + FieldType.notNullable(RunEndEncoded.INSTANCE), + List.of(runEndField, valueField)); + + try (final RunEndEncodedVector encodedVector1 = + new RunEndEncodedVector(field, allocator, reeVector, vector1, null); + final RunEndEncodedVector encodedVector2 = + new RunEndEncodedVector(field, allocator, reeVector, vector2, null); + final RunEndEncodedVector encodedVector3 = + new RunEndEncodedVector(field, allocator, reeVector, vector3, null)) { + + encodedVector1.setValueCount(3); + encodedVector2.setValueCount(3); + encodedVector3.setValueCount(3); + + Range range = new Range(0, 0, encodedVector1.getValueCount()); + assertTrue( + new ApproxEqualsVisitor(encodedVector1, encodedVector2, epsilon, epsilon) + .rangeEquals(range)); + assertFalse( + new ApproxEqualsVisitor(encodedVector1, encodedVector3, epsilon, epsilon) + .rangeEquals(range)); + } + } + } + private void writeStructVector(NullableStructWriter writer, int value1, long value2) { writer.start(); writer.integer("f0").writeInt(value1); From ebdf7ba3dd0574da82188edfbe0e16184ac12b93 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Thu, 22 May 2025 13:50:42 +0800 Subject: [PATCH 6/7] java doc --- .../vector/complex/RunEndEncodedVector.java | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java index 031632ede9..1fe7749872 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -830,6 +830,14 @@ public static class RangeIterator { private int runEnd; private int logicalPos; + /** + * Constructs a new RangeIterator for iterating over a range of values in a RunEndEncodedVector. + * + * @param runEndEncodedVector The vector to iterate over + * @param startIndex The logical start index of the range (inclusive) + * @param length The number of values to include in the range + * @throws IllegalArgumentException if startIndex is negative or (startIndex + length) exceeds vector bounds + */ public RangeIterator(RunEndEncodedVector runEndEncodedVector, int startIndex, int length) { int rangeEnd = startIndex + length; Preconditions.checkArgument( @@ -847,6 +855,11 @@ public RangeIterator(RunEndEncodedVector runEndEncodedVector, int startIndex, in this.logicalPos = -1; } + /** + * Advances to the next run in the range. + * + * @return true if there is another run available, false if iteration has completed + */ public boolean nextRun() { logicalPos = runEnd; if (logicalPos >= rangeEnd) { @@ -861,6 +874,11 @@ private void updateRun() { runEnd = (int) ((BaseIntVector) runEndEncodedVector.runEndsVector).getValueAsLong(runIndex); } + /** + * Advances to the next value in the range. + * + * @return true if there is another value available, false if iteration has completed + */ public boolean nextValue() { logicalPos++; if (logicalPos >= rangeEnd) { @@ -872,14 +890,29 @@ public boolean nextValue() { return true; } + /** + * Gets the current run index (physical position in the run-ends vector). + * + * @return the current run index + */ public int getRunIndex() { return runIndex; } + /** + * Gets the length of the current run within the iterator's range. + * + * @return the number of remaining values in current run within the iterator's range + */ public int getRunLength() { return Math.min(runEnd, rangeEnd) - logicalPos; } + /** + * Checks if iteration has completed. + * + * @return true if all values in the range have been processed, false otherwise + */ public boolean isEnd() { return logicalPos >= rangeEnd; } From e1dd4cbe3ab1faa26418b3e24005b35620b86caf Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Thu, 22 May 2025 15:27:33 +0800 Subject: [PATCH 7/7] format --- .../org/apache/arrow/vector/complex/RunEndEncodedVector.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java index 1fe7749872..b83e13449a 100644 --- a/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -836,7 +836,8 @@ public static class RangeIterator { * @param runEndEncodedVector The vector to iterate over * @param startIndex The logical start index of the range (inclusive) * @param length The number of values to include in the range - * @throws IllegalArgumentException if startIndex is negative or (startIndex + length) exceeds vector bounds + * @throws IllegalArgumentException if startIndex is negative or (startIndex + length) exceeds + * vector bounds */ public RangeIterator(RunEndEncodedVector runEndEncodedVector, int startIndex, int length) { int rangeEnd = startIndex + length;