From 5470deb20c02f2da0635e234b48951d6d3be0a07 Mon Sep 17 00:00:00 2001 From: ViggoC Date: Tue, 10 Dec 2024 23:45:49 +0800 Subject: [PATCH 1/3] Implement VectorAppender for BaseVariableWidthViewVector --- .../arrow/vector/util/VectorAppender.java | 62 +++++++++++++- .../org/apache/arrow/vector/TestUtils.java | 10 +++ .../vector/TestVariableWidthViewVector.java | 41 ++++------ .../testing/ValueVectorDataPopulator.java | 13 +++ .../arrow/vector/util/TestVectorAppender.java | 80 +++++++++++++++++++ 5 files changed, 178 insertions(+), 28 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index e703571b37..ffa5c6e42d 100644 --- a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -19,6 +19,8 @@ import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; import java.util.HashSet; +import java.util.List; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.util.MemoryUtil; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; @@ -91,7 +93,6 @@ public ValueVector visit(BaseFixedWidthVector deltaVector, Void value) { deltaVector.getDataBuffer(), deltaVector.getValueCount(), targetVector.getDataBuffer()); - } else { MemoryUtil.copyMemory( deltaVector.getDataBuffer().memoryAddress(), @@ -247,8 +248,63 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { } @Override - public ValueVector visit(BaseVariableWidthViewVector left, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + public ValueVector visit(BaseVariableWidthViewVector deltaVector, Void value) { + Preconditions.checkArgument( + typeVisitor.equals(deltaVector), + "The targetVector to append must have the same type as the targetVector being appended"); + + if (deltaVector.getValueCount() == 0) { + return targetVector; // nothing to append, return + } + + int oldTargetValueCount = targetVector.getValueCount(); + int newValueCount = oldTargetValueCount + deltaVector.getValueCount(); + + // make sure there is enough capacity + while (targetVector.getValueCapacity() < newValueCount) { + targetVector.reAlloc(); + } + + // append validity buffer + BitVectorHelper.concatBits( + targetVector.getValidityBuffer(), + oldTargetValueCount, + deltaVector.getValidityBuffer(), + deltaVector.getValueCount(), + targetVector.getValidityBuffer()); + + // append data buffers + BaseVariableWidthViewVector targetViewVector = (BaseVariableWidthViewVector) targetVector; + List targetDataBuffers = targetViewVector.getDataBuffers(); + final int oldTargetDataBufferCount = targetDataBuffers.size(); + List deltaVectorDataBuffers = deltaVector.getDataBuffers(); + deltaVectorDataBuffers.forEach(buf -> buf.getReferenceManager().retain()); + targetDataBuffers.addAll(deltaVectorDataBuffers); + + // append view buffer + ArrowBuf targetViewBuffer = targetVector.getDataBuffer(); + MemoryUtil.copyMemory( + deltaVector.getDataBuffer().memoryAddress(), + targetViewBuffer.memoryAddress() + + (long) BaseVariableWidthViewVector.ELEMENT_SIZE * oldTargetValueCount, + (long) BaseVariableWidthViewVector.ELEMENT_SIZE * deltaVector.getValueCount()); + + // update view buffer + for (int i = oldTargetValueCount; i < newValueCount; i++) { + if (targetViewVector.isSet(i) > 0 + && targetViewVector.getValueLength(i) > BaseVariableWidthViewVector.INLINE_SIZE) { + long start = + (long) i * BaseVariableWidthViewVector.ELEMENT_SIZE + + BaseVariableWidthViewVector.LENGTH_WIDTH + + BaseVariableWidthViewVector.PREFIX_WIDTH; + // shift buf id + int bufferId = targetViewBuffer.getInt(start); + targetViewBuffer.setInt(start, bufferId + oldTargetDataBufferCount); + } + } + + targetVector.setValueCount(newValueCount); + return targetVector; } @Override diff --git a/vector/src/test/java/org/apache/arrow/vector/TestUtils.java b/vector/src/test/java/org/apache/arrow/vector/TestUtils.java index 3845652ad0..82295f8037 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestUtils.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestUtils.java @@ -16,6 +16,7 @@ */ package org.apache.arrow.vector; +import java.util.Random; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -52,4 +53,13 @@ public static T newVector( Class c, String name, MinorType type, BufferAllocator allocator) { return c.cast(FieldType.nullable(type.getType()).createNewSingleVector(name, allocator, null)); } + + public static String generateRandomString(int length) { + Random random = new Random(); + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append(random.nextInt(10)); // 0-9 + } + return sb.toString(); + } } diff --git a/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java b/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java index a4533dba3b..7a3a1bae63 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java @@ -160,7 +160,7 @@ public void testDataBufferBasedAllocationInSameBuffer() { try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { viewVarCharVector.allocateNew(48, 4); final int valueCount = 4; - String str4 = generateRandomString(34); + String str4 = TestUtils.generateRandomString(34); viewVarCharVector.set(0, STR1); viewVarCharVector.set(1, STR2); viewVarCharVector.set(2, STR3); @@ -216,7 +216,7 @@ public void testDataBufferBasedAllocationInOtherBuffer() { try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { viewVarCharVector.allocateNew(48, 4); final int valueCount = 4; - String str4 = generateRandomString(35); + String str4 = TestUtils.generateRandomString(35); viewVarCharVector.set(0, STR1); viewVarCharVector.set(1, STR2); viewVarCharVector.set(2, STR3); @@ -271,7 +271,7 @@ public void testDataBufferBasedAllocationInOtherBuffer() { public void testSetSafe() { try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { viewVarCharVector.allocateNew(1, 1); - byte[] str6 = generateRandomString(40).getBytes(); + byte[] str6 = TestUtils.generateRandomString(40).getBytes(); final List strings = List.of(STR0, STR1, STR2, STR3, STR4, STR5, str6); // set data to a position out of capacity index @@ -305,8 +305,8 @@ public void testMixedAllocation() { try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { viewVarCharVector.allocateNew(128, 6); final int valueCount = 6; - String str4 = generateRandomString(35); - String str6 = generateRandomString(40); + String str4 = TestUtils.generateRandomString(35); + String str6 = TestUtils.generateRandomString(40); viewVarCharVector.set(0, STR1); viewVarCharVector.set(1, STR2); viewVarCharVector.set(2, STR3); @@ -405,7 +405,7 @@ public void testSetNullableViewVarCharHolder() { setAndCheck(viewVarCharVector, i, strings.get(size - i - 1), stringHolder); } - String longString = generateRandomString(128); + String longString = TestUtils.generateRandomString(128); setAndCheck(viewVarCharVector, 6, longString.getBytes(), stringHolder); } } @@ -441,7 +441,7 @@ public void testSetNullableViewVarBinaryHolder() { setAndCheck(viewVarBinaryVector, i, strings.get(size - i - 1), holder); } - String longString = generateRandomString(128); + String longString = TestUtils.generateRandomString(128); setAndCheck(viewVarBinaryVector, 6, longString.getBytes(), holder); } } @@ -1169,7 +1169,7 @@ public void testOverwriteShortFromLongString() { vector.setValueCount(5); // overwrite index 2 with a long string - String longString = generateRandomString(128); + String longString = TestUtils.generateRandomString(128); byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8); // since the append-only approach is used and the remaining capacity // is not enough to store the new string; a new buffer will be allocated. @@ -1373,7 +1373,7 @@ public void testOverwriteLongFromALongerLongString() { // since a new buffer is added to the dataBuffers final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); - String longerString = generateRandomString(35); + String longerString = TestUtils.generateRandomString(35); byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); assertTrue(remainingCapacity < longerStringBytes.length); @@ -1406,7 +1406,7 @@ public void testOverwriteLongFromALongerLongString() { // the remaining capacity is enough to store in the same data buffer final ArrowBuf currentDataBuf = vector.dataBuffers.get(0); final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex(); - String longerString = generateRandomString(24); + String longerString = TestUtils.generateRandomString(24); byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); assertTrue(remainingCapacity > longerStringBytes.length); @@ -1505,7 +1505,7 @@ public void testSafeOverwriteShortFromLongString() { vector.setValueCount(5); // overwrite index 2 with a long string - String longString = generateRandomString(128); + String longString = TestUtils.generateRandomString(128); byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8); vector.setSafe(2, longStringBytes); @@ -1671,7 +1671,7 @@ public void testSafeOverwriteLongFromALongerLongString() { vector.setSafe(2, STR7); vector.setValueCount(3); - String longerString = generateRandomString(35); + String longerString = TestUtils.generateRandomString(35); byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); vector.setSafe(1, longerStringBytes); @@ -1697,7 +1697,7 @@ public void testSafeOverwriteLongFromALongerLongString() { vector.setSafe(4, STR6); vector.setValueCount(5); - String longerString = generateRandomString(24); + String longerString = TestUtils.generateRandomString(24); byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); vector.setSafe(2, longerStringBytes); @@ -1869,7 +1869,7 @@ public void testCopyFromWithNulls( // to avoid re-allocation. This is to test copyFrom() without re-allocation. final int numberOfValues = initialCapacity / 2 / ViewVarCharVector.ELEMENT_SIZE; - final String prefixString = generateRandomString(12); + final String prefixString = TestUtils.generateRandomString(12); for (int i = 0; i < numberOfValues; i++) { if (i % 3 == 0) { @@ -1965,7 +1965,7 @@ public void testCopyFromSafeWithNulls( final int numberOfValues = initialCapacity / ViewVarCharVector.ELEMENT_SIZE; - final String prefixString = generateRandomString(12); + final String prefixString = TestUtils.generateRandomString(12); for (int i = 0; i < numberOfValues; i++) { if (i % 3 == 0) { @@ -2746,7 +2746,7 @@ private void testSplitAndTransferWithMultipleDataBuffersHelper( */ @Test public void testSplitAndTransferWithMultipleDataBuffers() { - final String str4 = generateRandomString(35); + final String str4 = TestUtils.generateRandomString(35); final byte[][] data = new byte[][] {STR1, STR2, STR3, str4.getBytes(StandardCharsets.UTF_8)}; final int startIndex = 1; final int length = 3; @@ -2851,13 +2851,4 @@ public void testVectorLoadUnloadOnMixedTypes() { } } } - - private String generateRandomString(int length) { - Random random = new Random(); - StringBuilder sb = new StringBuilder(length); - for (int i = 0; i < length; i++) { - sb.append(random.nextInt(10)); // 0-9 - } - return sb.toString(); - } } diff --git a/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index f599dfa539..849fe6d667 100644 --- a/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -60,6 +60,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.BaseLargeRepeatedValueViewVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; @@ -606,6 +607,18 @@ public static void setVector(VarCharVector vector, String... values) { vector.setValueCount(length); } + /** Populate values for ViewVarCharVector. */ + public static void setVector(ViewVarCharVector vector, String... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8)); + } + } + vector.setValueCount(length); + } + /** Populate values for LargeVarCharVector. */ public static void setVector(LargeVarCharVector vector, String... values) { final int length = values.length; diff --git a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 19eafd1b20..6b720e9ff7 100644 --- a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -24,16 +24,21 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.stream.IntStream; +import java.util.stream.Stream; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.Float4Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.TestUtils; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.compare.Range; import org.apache.arrow.vector.compare.RangeEqualsVisitor; import org.apache.arrow.vector.compare.TypeEqualsVisitor; @@ -171,6 +176,81 @@ public void testAppendVariableWidthVector() { } } + @Test + public void testAppendVariableWidthViewVector() { + final int length1 = 10; + final int length2 = 5; + try (ViewVarCharVector target = new ViewVarCharVector("", allocator); + ViewVarCharVector delta = new ViewVarCharVector("", allocator)) { + target.allocateNew(5, length1); + delta.allocateNew(5, length2); + + ValueVectorDataPopulator.setVector( + target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + ValueVectorDataPopulator.setVector(delta, "a10", "a11", "a12", "a13", null); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (ViewVarCharVector expected = new ViewVarCharVector("expected", allocator)) { + expected.allocateNew(); + ValueVectorDataPopulator.setVector( + expected, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9", "a10", "a11", + "a12", "a13", null); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendEmptyVariableWidthViewVector() { + try (ViewVarCharVector target = new ViewVarCharVector("", allocator); + ViewVarCharVector delta = new ViewVarCharVector("", allocator)) { + ValueVectorDataPopulator.setVector( + target, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (ViewVarCharVector expected = new ViewVarCharVector("expected", allocator)) { + ValueVectorDataPopulator.setVector( + expected, "a0", "a1", "a2", "a3", null, "a5", "a6", "a7", "a8", "a9"); + assertVectorsEqual(expected, target); + } + } + } + + @Test + public void testAppendShortLongVariableWidthViewVector() { + try (ViewVarCharVector target = new ViewVarCharVector("", allocator); + ViewVarCharVector delta = new ViewVarCharVector("", allocator)) { + String[] targetValues = + IntStream.range(-5, 5) + .mapToObj( + i -> TestUtils.generateRandomString(BaseVariableWidthViewVector.INLINE_SIZE + i)) + .toArray(String[]::new); + ValueVectorDataPopulator.setVector(target, targetValues); + + String[] deltaValues = + IntStream.range(-3, 3) + .mapToObj( + i -> TestUtils.generateRandomString(BaseVariableWidthViewVector.INLINE_SIZE + i)) + .toArray(String[]::new); + ValueVectorDataPopulator.setVector(delta, deltaValues); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + try (ViewVarCharVector expected = new ViewVarCharVector("expected", allocator)) { + ValueVectorDataPopulator.setVector( + expected, + Stream.concat(Arrays.stream(targetValues), Arrays.stream(deltaValues)) + .toArray(String[]::new)); + assertVectorsEqual(expected, target); + } + } + } + @Test public void testAppendEmptyVariableWidthVector() { try (VarCharVector target = new VarCharVector("", allocator); From 319d45fccb271dff37c6ee378d2c736841b81aac Mon Sep 17 00:00:00 2001 From: ViggoC Date: Sun, 19 Jan 2025 20:44:47 +0800 Subject: [PATCH 2/3] assert data buffers size --- .../java/org/apache/arrow/vector/util/TestVectorAppender.java | 1 + 1 file changed, 1 insertion(+) diff --git a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 6b720e9ff7..99dd3ab987 100644 --- a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -241,6 +241,7 @@ public void testAppendShortLongVariableWidthViewVector() { VectorAppender appender = new VectorAppender(target); delta.accept(appender, null); + assertEquals(2, target.getDataBuffers().size()); try (ViewVarCharVector expected = new ViewVarCharVector("expected", allocator)) { ValueVectorDataPopulator.setVector( expected, From e918540fd137d38e7a00d0e21fde4ce68da7e713 Mon Sep 17 00:00:00 2001 From: ViggoC Date: Mon, 20 Jan 2025 22:39:24 +0800 Subject: [PATCH 3/3] add testAppendLongVariableWidthViewVector --- .../arrow/vector/util/VectorAppender.java | 5 ++- .../arrow/vector/util/TestVectorAppender.java | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index ffa5c6e42d..0dc96a4d4b 100644 --- a/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -262,7 +262,10 @@ public ValueVector visit(BaseVariableWidthViewVector deltaVector, Void value) { // make sure there is enough capacity while (targetVector.getValueCapacity() < newValueCount) { - targetVector.reAlloc(); + // Do not call BaseVariableWidthViewVector#reAlloc() here, + // because reallocViewDataBuffer() is always unnecessary + ((BaseVariableWidthViewVector) targetVector).reallocValidityBuffer(); + ((BaseVariableWidthViewVector) targetVector).reallocViewBuffer(); } // append validity buffer diff --git a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 99dd3ab987..e1b3889d85 100644 --- a/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -252,6 +252,42 @@ public void testAppendShortLongVariableWidthViewVector() { } } + @Test + public void testAppendLongVariableWidthViewVector() { + try (ViewVarCharVector target = new ViewVarCharVector("", allocator); + ViewVarCharVector delta = new ViewVarCharVector("", allocator)) { + + String[] targetValues = randomLongViewVarCharVector(target); + String[] deltaValues = randomLongViewVarCharVector(delta); + + VectorAppender appender = new VectorAppender(target); + delta.accept(appender, null); + + assertEquals(4, target.getDataBuffers().size()); + try (ViewVarCharVector expected = new ViewVarCharVector("expected", allocator)) { + ValueVectorDataPopulator.setVector( + expected, + Stream.concat(Arrays.stream(targetValues), Arrays.stream(deltaValues)) + .toArray(String[]::new)); + assertVectorsEqual(expected, target); + } + } + } + + private static String[] randomLongViewVarCharVector(ViewVarCharVector target) { + assertEquals(0, target.getDataBuffers().size()); + int initial = 64; + int stringCount = 128; + target.setInitialCapacity(initial); + String[] targetValues = + IntStream.range(0, stringCount) + .mapToObj(i -> TestUtils.generateRandomString(BaseVariableWidthViewVector.ELEMENT_SIZE)) + .toArray(String[]::new); + ValueVectorDataPopulator.setVector(target, targetValues); + assertEquals(2, target.getDataBuffers().size()); + return targetValues; + } + @Test public void testAppendEmptyVariableWidthVector() { try (VarCharVector target = new VarCharVector("", allocator);