From 01515c8ec776e06e1548e803f0fb1744f8b4527a Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Wed, 15 Oct 2025 12:30:11 -0500 Subject: [PATCH 1/5] Added functionality similar to parquet-tools to print info of parquet files. --- .../main/java/com/altinity/ice/cli/Main.java | 56 +++++ .../ice/cli/internal/cmd/DescribeParquet.java | 233 ++++++++++++++++++ .../ice/cli/internal/iceberg/Sorting.java | 7 +- .../cli/internal/cmd/DescribeParquetTest.java | 89 +++++++ 4 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java create mode 100644 ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java diff --git a/ice/src/main/java/com/altinity/ice/cli/Main.java b/ice/src/main/java/com/altinity/ice/cli/Main.java index 3e22973..397b709 100644 --- a/ice/src/main/java/com/altinity/ice/cli/Main.java +++ b/ice/src/main/java/com/altinity/ice/cli/Main.java @@ -18,6 +18,7 @@ import com.altinity.ice.cli.internal.cmd.DeleteNamespace; import com.altinity.ice.cli.internal.cmd.DeleteTable; import com.altinity.ice.cli.internal.cmd.Describe; +import com.altinity.ice.cli.internal.cmd.DescribeParquet; import com.altinity.ice.cli.internal.cmd.Insert; import com.altinity.ice.cli.internal.cmd.InsertWatch; import com.altinity.ice.cli.internal.cmd.Scan; @@ -134,6 +135,61 @@ void describe( } } + @CommandLine.Command(name = "describe-parquet", description = "Describe parquet file metadata.") + void describeParquet( + @CommandLine.Parameters( + arity = "1", + paramLabel = "", + description = "Path to parquet file") + String target, + @CommandLine.Option( + names = {"-a", "--all"}, + description = "Show everything") + boolean showAll, + @CommandLine.Option( + names = {"-s", "--summary"}, + description = "Show size, rows, number of row groups, size, compress_size, etc.") + boolean showSummary, + @CommandLine.Option( + names = {"-c", "--columns"}, + description = "Show columns") + boolean showColumns, + @CommandLine.Option( + names = {"-r", "--row-groups"}, + description = "Show row groups") + boolean showRowGroups, + @CommandLine.Option( + names = {"-d", "--row-group-details"}, + description = "Show column stats within row group") + boolean showRowGroupDetails, + @CommandLine.Option( + names = {"--json"}, + description = "Output JSON instead of YAML") + boolean json) + throws IOException { + try (RESTCatalog catalog = loadCatalog()) { + var options = new java.util.ArrayList(); + if (showAll || showSummary) { + options.add(DescribeParquet.Option.SUMMARY); + } + if (showAll || showColumns) { + options.add(DescribeParquet.Option.COLUMNS); + } + if (showAll || showRowGroups) { + options.add(DescribeParquet.Option.ROW_GROUPS); + } + if (showAll || showRowGroupDetails) { + options.add(DescribeParquet.Option.ROW_GROUP_DETAILS); + } + + if (options.isEmpty()) { + options.add(DescribeParquet.Option.SUMMARY); + } + + DescribeParquet.run(catalog, target, json, options.toArray(new DescribeParquet.Option[0])); + } + } + public record IceSortOrder( @JsonProperty("column") String column, @JsonProperty("desc") boolean desc, diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java new file mode 100644 index 0000000..3671438 --- /dev/null +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.cli.internal.cmd; + +import com.altinity.ice.cli.internal.iceberg.io.Input; +import com.altinity.ice.cli.internal.iceberg.parquet.Metadata; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.rest.RESTCatalog; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; +import software.amazon.awssdk.utils.Lazy; + +public final class DescribeParquet { + + private DescribeParquet() {} + + public enum Option { + ALL, + SUMMARY, + COLUMNS, + ROW_GROUPS, + ROW_GROUP_DETAILS + } + + public static void run( + RESTCatalog catalog, + String filePath, + boolean json, + Option... options) + throws IOException { + + FileIO io = Input.newIO(filePath, null, new Lazy<>(() -> null)); + InputFile inputFile = Input.newFile(filePath, catalog, io); + ParquetMetadata metadata = Metadata.read(inputFile); + + ParquetInfo info = extractParquetInfo(metadata, options); + + ObjectMapper mapper = json ? new ObjectMapper() : new ObjectMapper(new YAMLFactory()); + mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); + String output = mapper.writeValueAsString(info); + System.out.println(output); + } + + private static ParquetInfo extractParquetInfo(ParquetMetadata metadata, Option... options) { + var optionsSet = java.util.Set.of(options); + boolean includeAll = optionsSet.contains(Option.ALL); + + FileMetaData fileMetadata = metadata.getFileMetaData(); + + // Summary info + Summary summary = null; + if (includeAll || optionsSet.contains(Option.SUMMARY)) { + long totalRows = metadata.getBlocks().stream() + .mapToLong(BlockMetaData::getRowCount) + .sum(); + + long compressedSize = metadata.getBlocks().stream() + .mapToLong(BlockMetaData::getCompressedSize) + .sum(); + + long uncompressedSize = metadata.getBlocks().stream() + .mapToLong(BlockMetaData::getTotalByteSize) + .sum(); + + summary = new Summary( + totalRows, + metadata.getBlocks().size(), + compressedSize, + uncompressedSize, + fileMetadata.getCreatedBy(), + fileMetadata.getSchema().getFieldCount() + ); + } + + // Column info + List columns = null; + if (includeAll || optionsSet.contains(Option.COLUMNS)) { + columns = extractColumns(fileMetadata.getSchema()); + } + + // Row group info + List rowGroups = null; + if (includeAll || optionsSet.contains(Option.ROW_GROUPS) || optionsSet.contains(Option.ROW_GROUP_DETAILS)) { + boolean includeDetails = includeAll || optionsSet.contains(Option.ROW_GROUP_DETAILS); + rowGroups = extractRowGroups(metadata.getBlocks(), includeDetails); + } + + return new ParquetInfo(summary, columns, rowGroups); + } + + private static List extractColumns(MessageType schema) { + List columns = new ArrayList<>(); + for (Type field : schema.getFields()) { + String logicalType = null; + if (field.isPrimitive()) { + var annotation = field.asPrimitiveType().getLogicalTypeAnnotation(); + logicalType = annotation != null ? annotation.toString() : null; + } + columns.add(new Column( + field.getName(), + field.isPrimitive() ? field.asPrimitiveType().getPrimitiveTypeName().name() : "GROUP", + field.getRepetition().name(), + logicalType + )); + } + return columns; + } + + private static List extractRowGroups(List blocks, boolean includeDetails) { + List rowGroups = new ArrayList<>(); + + for (int i = 0; i < blocks.size(); i++) { + BlockMetaData block = blocks.get(i); + + List columnChunks = null; + if (includeDetails) { + columnChunks = new ArrayList<>(); + for (ColumnChunkMetaData column : block.getColumns()) { + Statistics stats = column.getStatistics(); + + ColumnStats columnStats = null; + if (stats != null && !stats.isEmpty()) { + long nulls = stats.isNumNullsSet() ? stats.getNumNulls() : 0; + String min = null; + String max = null; + if (stats.hasNonNullValue()) { + Object minVal = stats.genericGetMin(); + Object maxVal = stats.genericGetMax(); + min = minVal != null ? minVal.toString() : null; + max = maxVal != null ? maxVal.toString() : null; + } + columnStats = new ColumnStats(nulls, min, max); + } + + columnChunks.add(new ColumnChunk( + column.getPath().toDotString(), + column.getPrimitiveType().getName(), + column.getEncodings().toString(), + column.getCodec().name(), + column.getTotalSize(), + column.getTotalUncompressedSize(), + column.getValueCount(), + columnStats + )); + } + } + + rowGroups.add(new RowGroup( + i, + block.getRowCount(), + block.getTotalByteSize(), + block.getCompressedSize(), + block.getStartingPos(), + columnChunks + )); + } + + return rowGroups; + } + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record ParquetInfo( + Summary summary, + List columns, + List rowGroups + ) {} + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record Summary( + long rows, + int rowGroups, + long compressedSize, + long uncompressedSize, + String createdBy, + int columnCount + ) {} + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record Column( + String name, + String type, + String repetition, + String logicalType + ) {} + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record RowGroup( + int index, + long rowCount, + long totalSize, + long compressedSize, + long startingPos, + List columns + ) {} + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record ColumnChunk( + String path, + String type, + String encodings, + String codec, + long totalSize, + long uncompressedSize, + long valueCount, + ColumnStats stats + ) {} + + @JsonInclude(JsonInclude.Include.NON_NULL) + public record ColumnStats( + long nulls, + String min, + String max + ) {} +} \ No newline at end of file diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/Sorting.java b/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/Sorting.java index c972752..63730f1 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/Sorting.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/Sorting.java @@ -25,7 +25,6 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.Types; @@ -100,16 +99,14 @@ public String toUnsortedDiffString() { } } - public static boolean isSorted( - InputFile inputFile, Schema tableSchema, SortOrder sortOrder) + public static boolean isSorted(InputFile inputFile, Schema tableSchema, SortOrder sortOrder) throws IOException { return checkSorted(inputFile, tableSchema, sortOrder).ok; } // TODO: check metadata first to avoid full scan when unsorted public static SortCheckResult checkSorted( - InputFile inputFile, Schema tableSchema, SortOrder sortOrder) - throws IOException { + InputFile inputFile, Schema tableSchema, SortOrder sortOrder) throws IOException { if (sortOrder.isUnsorted()) { return new SortCheckResult(false); } diff --git a/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java new file mode 100644 index 0000000..13d41a1 --- /dev/null +++ b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2025 Altinity Inc and/or its affiliates. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.altinity.ice.cli.internal.cmd; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.altinity.ice.test.Resource; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import org.apache.iceberg.inmemory.InMemoryCatalog; +import org.testng.annotations.Test; + +public class DescribeParquetTest { + + @Test + public void testDescribeParquetSummary() throws IOException { + ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + PrintStream originalOut = System.out; + System.setOut(new PrintStream(outContent)); + + try { + InMemoryCatalog catalog = new InMemoryCatalog(); + catalog.initialize("test", java.util.Map.of()); + + String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + + DescribeParquet.run((org.apache.iceberg.rest.RESTCatalog) null, sampleFile, false, DescribeParquet.Option.SUMMARY); + + String output = outContent.toString(); + + assertThat(output).contains("rows:"); + assertThat(output).contains("rowGroups:"); + assertThat(output).contains("compressedSize:"); + assertThat(output).contains("uncompressedSize:"); + } finally { + System.setOut(originalOut); + } + } + + @Test + public void testDescribeParquetColumns() throws IOException { + ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + PrintStream originalOut = System.out; + System.setOut(new PrintStream(outContent)); + + try { + String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + + DescribeParquet.run(null, sampleFile, false, DescribeParquet.Option.COLUMNS); + + String output = outContent.toString(); + + assertThat(output).contains("columns:"); + assertThat(output).contains("name:"); + assertThat(output).contains("type:"); + } finally { + System.setOut(originalOut); + } + } + + @Test + public void testDescribeParquetJson() throws IOException { + ByteArrayOutputStream outContent = new ByteArrayOutputStream(); + PrintStream originalOut = System.out; + System.setOut(new PrintStream(outContent)); + + try { + String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + + DescribeParquet.run(null, sampleFile, true, DescribeParquet.Option.SUMMARY); + + String output = outContent.toString(); + + assertThat(output).contains("{"); + assertThat(output).contains("}"); + assertThat(output).contains("\"summary\""); + } finally { + System.setOut(originalOut); + } + } +} \ No newline at end of file From 967bad6fd069c94892bbf7c982f5796d34ecf878 Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Wed, 15 Oct 2025 13:26:06 -0500 Subject: [PATCH 2/5] Fixed unit tests. --- ice/src/main/java/com/altinity/ice/cli/Main.java | 2 +- .../ice/cli/internal/cmd/DescribeParquet.java | 9 +++++++++ .../ice/cli/internal/cmd/DescribeParquetTest.java | 12 ++++++------ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/ice/src/main/java/com/altinity/ice/cli/Main.java b/ice/src/main/java/com/altinity/ice/cli/Main.java index 397b709..5858493 100644 --- a/ice/src/main/java/com/altinity/ice/cli/Main.java +++ b/ice/src/main/java/com/altinity/ice/cli/Main.java @@ -151,7 +151,7 @@ void describeParquet( description = "Show size, rows, number of row groups, size, compress_size, etc.") boolean showSummary, @CommandLine.Option( - names = {"-c", "--columns"}, + names = {"--columns"}, description = "Show columns") boolean showColumns, @CommandLine.Option( diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java index 3671438..db2331e 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java @@ -50,6 +50,15 @@ public static void run( FileIO io = Input.newIO(filePath, null, new Lazy<>(() -> null)); InputFile inputFile = Input.newFile(filePath, catalog, io); + run(inputFile, json, options); + } + + public static void run( + InputFile inputFile, + boolean json, + Option... options) + throws IOException { + ParquetMetadata metadata = Metadata.read(inputFile); ParquetInfo info = extractParquetInfo(metadata, options); diff --git a/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java index 13d41a1..125cc34 100644 --- a/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java +++ b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java @@ -30,9 +30,9 @@ public void testDescribeParquetSummary() throws IOException { InMemoryCatalog catalog = new InMemoryCatalog(); catalog.initialize("test", java.util.Map.of()); - String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - DescribeParquet.run((org.apache.iceberg.rest.RESTCatalog) null, sampleFile, false, DescribeParquet.Option.SUMMARY); + DescribeParquet.run(sampleFile, false, DescribeParquet.Option.SUMMARY); String output = outContent.toString(); @@ -52,9 +52,9 @@ public void testDescribeParquetColumns() throws IOException { System.setOut(new PrintStream(outContent)); try { - String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - DescribeParquet.run(null, sampleFile, false, DescribeParquet.Option.COLUMNS); + DescribeParquet.run(sampleFile, false, DescribeParquet.Option.COLUMNS); String output = outContent.toString(); @@ -73,9 +73,9 @@ public void testDescribeParquetJson() throws IOException { System.setOut(new PrintStream(outContent)); try { - String sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet").location(); + var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - DescribeParquet.run(null, sampleFile, true, DescribeParquet.Option.SUMMARY); + DescribeParquet.run(sampleFile, true, DescribeParquet.Option.SUMMARY); String output = outContent.toString(); From 6dd8278d88480053b0e01da023ac87dc71bb2b66 Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Sun, 26 Oct 2025 08:05:00 -0500 Subject: [PATCH 3/5] Formatting changes. --- .../ice/cli/internal/cmd/DescribeParquet.java | 163 ++++++++---------- .../cli/internal/cmd/DescribeParquetTest.java | 41 +++-- 2 files changed, 91 insertions(+), 113 deletions(-) diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java index db2331e..4b5a714 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java @@ -41,28 +41,20 @@ public enum Option { ROW_GROUP_DETAILS } - public static void run( - RESTCatalog catalog, - String filePath, - boolean json, - Option... options) + public static void run(RESTCatalog catalog, String filePath, boolean json, Option... options) throws IOException { - + FileIO io = Input.newIO(filePath, null, new Lazy<>(() -> null)); InputFile inputFile = Input.newFile(filePath, catalog, io); run(inputFile, json, options); } - public static void run( - InputFile inputFile, - boolean json, - Option... options) - throws IOException { - + public static void run(InputFile inputFile, boolean json, Option... options) throws IOException { + ParquetMetadata metadata = Metadata.read(inputFile); - + ParquetInfo info = extractParquetInfo(metadata, options); - + ObjectMapper mapper = json ? new ObjectMapper() : new ObjectMapper(new YAMLFactory()); mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); String output = mapper.writeValueAsString(info); @@ -72,47 +64,45 @@ public static void run( private static ParquetInfo extractParquetInfo(ParquetMetadata metadata, Option... options) { var optionsSet = java.util.Set.of(options); boolean includeAll = optionsSet.contains(Option.ALL); - + FileMetaData fileMetadata = metadata.getFileMetaData(); - + // Summary info Summary summary = null; if (includeAll || optionsSet.contains(Option.SUMMARY)) { - long totalRows = metadata.getBlocks().stream() - .mapToLong(BlockMetaData::getRowCount) - .sum(); - - long compressedSize = metadata.getBlocks().stream() - .mapToLong(BlockMetaData::getCompressedSize) - .sum(); - - long uncompressedSize = metadata.getBlocks().stream() - .mapToLong(BlockMetaData::getTotalByteSize) - .sum(); - - summary = new Summary( - totalRows, - metadata.getBlocks().size(), - compressedSize, - uncompressedSize, - fileMetadata.getCreatedBy(), - fileMetadata.getSchema().getFieldCount() - ); + long totalRows = metadata.getBlocks().stream().mapToLong(BlockMetaData::getRowCount).sum(); + + long compressedSize = + metadata.getBlocks().stream().mapToLong(BlockMetaData::getCompressedSize).sum(); + + long uncompressedSize = + metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); + + summary = + new Summary( + totalRows, + metadata.getBlocks().size(), + compressedSize, + uncompressedSize, + fileMetadata.getCreatedBy(), + fileMetadata.getSchema().getFieldCount()); } - + // Column info List columns = null; if (includeAll || optionsSet.contains(Option.COLUMNS)) { columns = extractColumns(fileMetadata.getSchema()); } - + // Row group info List rowGroups = null; - if (includeAll || optionsSet.contains(Option.ROW_GROUPS) || optionsSet.contains(Option.ROW_GROUP_DETAILS)) { + if (includeAll + || optionsSet.contains(Option.ROW_GROUPS) + || optionsSet.contains(Option.ROW_GROUP_DETAILS)) { boolean includeDetails = includeAll || optionsSet.contains(Option.ROW_GROUP_DETAILS); rowGroups = extractRowGroups(metadata.getBlocks(), includeDetails); } - + return new ParquetInfo(summary, columns, rowGroups); } @@ -124,28 +114,29 @@ private static List extractColumns(MessageType schema) { var annotation = field.asPrimitiveType().getLogicalTypeAnnotation(); logicalType = annotation != null ? annotation.toString() : null; } - columns.add(new Column( - field.getName(), - field.isPrimitive() ? field.asPrimitiveType().getPrimitiveTypeName().name() : "GROUP", - field.getRepetition().name(), - logicalType - )); + columns.add( + new Column( + field.getName(), + field.isPrimitive() ? field.asPrimitiveType().getPrimitiveTypeName().name() : "GROUP", + field.getRepetition().name(), + logicalType)); } return columns; } - private static List extractRowGroups(List blocks, boolean includeDetails) { + private static List extractRowGroups( + List blocks, boolean includeDetails) { List rowGroups = new ArrayList<>(); - + for (int i = 0; i < blocks.size(); i++) { BlockMetaData block = blocks.get(i); - + List columnChunks = null; if (includeDetails) { columnChunks = new ArrayList<>(); for (ColumnChunkMetaData column : block.getColumns()) { Statistics stats = column.getStatistics(); - + ColumnStats columnStats = null; if (stats != null && !stats.isEmpty()) { long nulls = stats.isNumNullsSet() ? stats.getNumNulls() : 0; @@ -159,39 +150,35 @@ private static List extractRowGroups(List blocks, boole } columnStats = new ColumnStats(nulls, min, max); } - - columnChunks.add(new ColumnChunk( - column.getPath().toDotString(), - column.getPrimitiveType().getName(), - column.getEncodings().toString(), - column.getCodec().name(), - column.getTotalSize(), - column.getTotalUncompressedSize(), - column.getValueCount(), - columnStats - )); + + columnChunks.add( + new ColumnChunk( + column.getPath().toDotString(), + column.getPrimitiveType().getName(), + column.getEncodings().toString(), + column.getCodec().name(), + column.getTotalSize(), + column.getTotalUncompressedSize(), + column.getValueCount(), + columnStats)); } } - - rowGroups.add(new RowGroup( - i, - block.getRowCount(), - block.getTotalByteSize(), - block.getCompressedSize(), - block.getStartingPos(), - columnChunks - )); + + rowGroups.add( + new RowGroup( + i, + block.getRowCount(), + block.getTotalByteSize(), + block.getCompressedSize(), + block.getStartingPos(), + columnChunks)); } - + return rowGroups; } @JsonInclude(JsonInclude.Include.NON_NULL) - public record ParquetInfo( - Summary summary, - List columns, - List rowGroups - ) {} + public record ParquetInfo(Summary summary, List columns, List rowGroups) {} @JsonInclude(JsonInclude.Include.NON_NULL) public record Summary( @@ -200,16 +187,10 @@ public record Summary( long compressedSize, long uncompressedSize, String createdBy, - int columnCount - ) {} + int columnCount) {} @JsonInclude(JsonInclude.Include.NON_NULL) - public record Column( - String name, - String type, - String repetition, - String logicalType - ) {} + public record Column(String name, String type, String repetition, String logicalType) {} @JsonInclude(JsonInclude.Include.NON_NULL) public record RowGroup( @@ -218,8 +199,7 @@ public record RowGroup( long totalSize, long compressedSize, long startingPos, - List columns - ) {} + List columns) {} @JsonInclude(JsonInclude.Include.NON_NULL) public record ColumnChunk( @@ -230,13 +210,8 @@ public record ColumnChunk( long totalSize, long uncompressedSize, long valueCount, - ColumnStats stats - ) {} + ColumnStats stats) {} @JsonInclude(JsonInclude.Include.NON_NULL) - public record ColumnStats( - long nulls, - String min, - String max - ) {} -} \ No newline at end of file + public record ColumnStats(long nulls, String min, String max) {} +} diff --git a/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java index 125cc34..a58d8aa 100644 --- a/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java +++ b/ice/src/test/java/com/altinity/ice/cli/internal/cmd/DescribeParquetTest.java @@ -25,17 +25,18 @@ public void testDescribeParquetSummary() throws IOException { ByteArrayOutputStream outContent = new ByteArrayOutputStream(); PrintStream originalOut = System.out; System.setOut(new PrintStream(outContent)); - + try { InMemoryCatalog catalog = new InMemoryCatalog(); catalog.initialize("test", java.util.Map.of()); - - var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - + + var sampleFile = + Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); + DescribeParquet.run(sampleFile, false, DescribeParquet.Option.SUMMARY); - + String output = outContent.toString(); - + assertThat(output).contains("rows:"); assertThat(output).contains("rowGroups:"); assertThat(output).contains("compressedSize:"); @@ -50,14 +51,15 @@ public void testDescribeParquetColumns() throws IOException { ByteArrayOutputStream outContent = new ByteArrayOutputStream(); PrintStream originalOut = System.out; System.setOut(new PrintStream(outContent)); - + try { - var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - + var sampleFile = + Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); + DescribeParquet.run(sampleFile, false, DescribeParquet.Option.COLUMNS); - + String output = outContent.toString(); - + assertThat(output).contains("columns:"); assertThat(output).contains("name:"); assertThat(output).contains("type:"); @@ -65,20 +67,21 @@ public void testDescribeParquetColumns() throws IOException { System.setOut(originalOut); } } - - @Test + + @Test public void testDescribeParquetJson() throws IOException { ByteArrayOutputStream outContent = new ByteArrayOutputStream(); PrintStream originalOut = System.out; System.setOut(new PrintStream(outContent)); - + try { - var sampleFile = Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); - + var sampleFile = + Resource.asInputFile("com/altinity/ice/cli/internal/iceberg/parquet/sample-001.parquet"); + DescribeParquet.run(sampleFile, true, DescribeParquet.Option.SUMMARY); - + String output = outContent.toString(); - + assertThat(output).contains("{"); assertThat(output).contains("}"); assertThat(output).contains("\"summary\""); @@ -86,4 +89,4 @@ public void testDescribeParquetJson() throws IOException { System.setOut(originalOut); } } -} \ No newline at end of file +} From ce3c2f00d3cdc6e4f77b604269c6a7aa4586135b Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Wed, 29 Oct 2025 14:23:42 -0500 Subject: [PATCH 4/5] Pass s3 region and no-sign-request options to show parquet metadata option. --- ice/src/main/java/com/altinity/ice/cli/Main.java | 11 +++++++++-- .../ice/cli/internal/cmd/DescribeParquet.java | 11 +++++++++-- .../altinity/ice/cli/internal/iceberg/io/Input.java | 5 ++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/ice/src/main/java/com/altinity/ice/cli/Main.java b/ice/src/main/java/com/altinity/ice/cli/Main.java index b233ee0..43d3900 100644 --- a/ice/src/main/java/com/altinity/ice/cli/Main.java +++ b/ice/src/main/java/com/altinity/ice/cli/Main.java @@ -165,8 +165,14 @@ void describeParquet( @CommandLine.Option( names = {"--json"}, description = "Output JSON instead of YAML") - boolean json) + boolean json, + @CommandLine.Option(names = {"--s3-region"}) String s3Region, + @CommandLine.Option( + names = {"--s3-no-sign-request"}, + description = "Access S3 files without authentication") + boolean s3NoSignRequest) throws IOException { + setAWSRegion(s3Region); try (RESTCatalog catalog = loadCatalog()) { var options = new java.util.ArrayList(); if (showAll || showSummary) { @@ -186,7 +192,8 @@ void describeParquet( options.add(DescribeParquet.Option.SUMMARY); } - DescribeParquet.run(catalog, target, json, options.toArray(new DescribeParquet.Option[0])); + DescribeParquet.run( + catalog, target, json, s3NoSignRequest, options.toArray(new DescribeParquet.Option[0])); } } diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java index 4b5a714..88a2543 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java @@ -41,10 +41,17 @@ public enum Option { ROW_GROUP_DETAILS } - public static void run(RESTCatalog catalog, String filePath, boolean json, Option... options) + public static void run( + RESTCatalog catalog, + String filePath, + boolean json, + boolean s3NoSignRequest, + Option... options) throws IOException { - FileIO io = Input.newIO(filePath, null, new Lazy<>(() -> null)); + Lazy s3ClientLazy = + new Lazy<>(() -> com.altinity.ice.cli.internal.s3.S3.newClient(s3NoSignRequest)); + FileIO io = Input.newIO(filePath, null, s3ClientLazy); InputFile inputFile = Input.newFile(filePath, catalog, io); run(inputFile, json, options); } diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/io/Input.java b/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/io/Input.java index 927a639..cd99444 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/io/Input.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/iceberg/io/Input.java @@ -89,8 +89,11 @@ case String s when (s.startsWith("http:") || s.startsWith("https:")) -> { createParentDirs(dst.toFile()); String tempName = name + "~"; Path tmp = Paths.get(httpCachePath, tempName); + // Clean up any existing temp file from previous interrupted runs + if (Files.exists(tmp)) { + Files.delete(tmp); + } try (InputStream in = URI.create(s).toURL().openStream()) { - // FIXME: race with another copy Files.copy(in, tmp); } Files.move(tmp, dst); From be1d9f09884752d27259d669ee3614d8874a9318 Mon Sep 17 00:00:00 2001 From: Kanthi Subramanian Date: Thu, 6 Nov 2025 20:36:51 -0600 Subject: [PATCH 5/5] use S3CrossRegionSyncClient --- .../com/altinity/ice/cli/internal/cmd/DescribeParquet.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java index 88a2543..83abf7a 100644 --- a/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java +++ b/ice/src/main/java/com/altinity/ice/cli/internal/cmd/DescribeParquet.java @@ -27,6 +27,7 @@ import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Type; +import software.amazon.awssdk.services.s3.internal.crossregion.S3CrossRegionSyncClient; import software.amazon.awssdk.utils.Lazy; public final class DescribeParquet { @@ -50,7 +51,10 @@ public static void run( throws IOException { Lazy s3ClientLazy = - new Lazy<>(() -> com.altinity.ice.cli.internal.s3.S3.newClient(s3NoSignRequest)); + new Lazy<>( + () -> + new S3CrossRegionSyncClient( + com.altinity.ice.cli.internal.s3.S3.newClient(s3NoSignRequest))); FileIO io = Input.newIO(filePath, null, s3ClientLazy); InputFile inputFile = Input.newFile(filePath, catalog, io); run(inputFile, json, options);