From 9b69e5b1c95f42a20d88388a0d093d17ce1bcfe6 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 1 Dec 2025 13:26:02 +0100 Subject: [PATCH 1/4] test data generator for HDF5 Readers --- .../functions/io/hdf5/gen_HDF5_testdata.R | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R new file mode 100644 index 00000000000..22e94da1d41 --- /dev/null +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -0,0 +1,225 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +# Generate various HDF5 test files with different formats. +# Creates test files in the 'in' directory. + +if (!require("rhdf5", quietly = TRUE)) { + cat("Error: rhdf5 is not installed.\n") + quit(status = 1) +} + +SMALL_MATRIX_2D <- c(200, 40) +LARGE_MATRIX_2D <- c(800, 30) +SMALL_MATRIX_3D <- c(15, 15, 5) + +VECTOR_LENGTH <- 200 +STRING_ARRAY_LENGTH <- 30 + +CHUNK_SHAPE <- c(100, 20) +LARGE_CHUNK_SHAPE <- c(200, 30) + +TENSOR_DIMS <- list( + samples = 120, + height = 16, + width = 16, + channels_a = 4, + channels_b = 5, + label_features = 12 +) + +write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { + values <- generator(prod(shape)) + h5write(array(values, dim = shape), file_path, dataset_name) +} + +generate_test_file_single_dataset <- function(dir) { + file_path <- file.path(dir, "test_single_dataset.h5") + h5createFile(file_path) + write_matrix(file_path, "data", SMALL_MATRIX_2D) + cat("Created test_single_dataset.h5 (single 2D dataset)\n") +} + +generate_test_file_multiple_datasets <- function(dir) { + file_path <- file.path(dir, "test_multiple_datasets.h5") + h5createFile(file_path) + write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) + h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") + write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) + cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") +} + +generate_test_file_different_dtypes <- function(dir) { + file_path <- file.path(dir, "test_different_dtypes.h5") + h5createFile(file_path) + write_matrix(file_path, "double_primary", SMALL_MATRIX_2D) + write_matrix(file_path, "double_secondary", SMALL_MATRIX_2D) + write_matrix( + file_path, + "int32", + SMALL_MATRIX_2D, + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + ) + write_matrix( + file_path, + "int32_alt", + SMALL_MATRIX_2D, + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + ) + cat("Created test_different_dtypes.h5 (double/int 
datasets)\n") +} + +# https://support.hdfgroup.org/documentation/hdf5-docs/advanced_topics/chunking_in_hdf5.html +generate_test_file_chunked <- function(dir) { + file_path <- file.path(dir, "test_chunked.h5") + h5createFile(file_path) + + data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE) + h5write(data, file_path, "chunked_data") + + write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) + cat("Created test_chunked.h5 (chunked dataset)\n") +} + +generate_test_file_compressed <- function(dir) { + file_path <- file.path(dir, "test_compressed.h5") + h5createFile(file_path) + data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) + h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, + chunk = SMALL_MATRIX_2D, level = 9) + h5write(data, file_path, "gzip_compressed_9") + h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, + chunk = SMALL_MATRIX_2D, level = 1) + h5write(data, file_path, "gzip_compressed_1") + cat("Created test_compressed.h5 (gzip compression)\n") +} + +generate_test_file_multi_tensor_samples <- function(dir) { + file_path <- file.path(dir, "test_multi_tensor_samples.h5") + h5createFile(file_path) + write_matrix( + file_path, + "sen1", + c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_a) + ) + write_matrix( + file_path, + "sen2", + c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_b) + ) + write_matrix( + file_path, + "label", + c(TENSOR_DIMS$samples, TENSOR_DIMS$label_features), + generator = function(n) as.integer(sample(0:1, n, replace = TRUE)) + ) + cat("Created test_multi_tensor_samples.h5 (multi-input tensors)\n") +} + +generate_test_file_nested_groups <- function(dir) { + file_path <- file.path(dir, "test_nested_groups.h5") + h5createFile(file_path) + write_matrix(file_path, "root_data", SMALL_MATRIX_2D) + h5createGroup(file_path, "group1") + write_matrix(file_path, "group1/data1", SMALL_MATRIX_2D) + h5createGroup(file_path, "group1/subgroup") + write_matrix(file_path, "group1/subgroup/data2", SMALL_MATRIX_2D) + cat("Created test_nested_groups.h5 (nested group hierarchy)\n") +} + +generate_test_file_with_attributes <- function(dir) { + file_path <- file.path(dir, "test_with_attributes.h5") + h5createFile(file_path) + write_matrix(file_path, "data", SMALL_MATRIX_2D) + + fid <- H5Fopen(file_path) + did <- H5Dopen(fid, "data") + h5writeAttribute("Test dataset with attributes", did, "description") + h5writeAttribute(1.0, did, "version") + h5writeAttribute(SMALL_MATRIX_2D, did, "shape") + H5Dclose(did) + + h5writeAttribute("2025-11-26", fid, "file_created") + h5writeAttribute("attributes", fid, "test_type") + H5Fclose(fid) + cat("Created test_with_attributes.h5 (dataset + file attributes)\n") +} + +generate_test_file_empty_datasets <- function(dir) { + file_path <- file.path(dir, "test_empty_datasets.h5") + h5createFile(file_path) + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2])) + + h5write(1.0, file_path, "scalar") + h5write(rnorm(VECTOR_LENGTH), file_path, "vector") + cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") +} + +generate_test_file_string_datasets <- function(dir) { + file_path <- file.path(dir, "test_string_datasets.h5") + h5createFile(file_path) + strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) + h5write(strings, file_path, "string_array") + cat("Created test_string_datasets.h5 (string 
datasets)\n") +} + +main <- function() { + # Check if working directory is "hdf5". Quit if not. + if (basename(getwd()) != "hdf5") { + cat("You must execute this script from the 'hdf5' directory!\n") + quit(status = 1) + } + + testdir <- "in" + if (!dir.exists(testdir)) { + dir.create(testdir) + } + + test_functions <- list( + generate_test_file_single_dataset, + generate_test_file_multiple_datasets, + generate_test_file_different_dtypes, + generate_test_file_chunked, + generate_test_file_compressed, + generate_test_file_multi_tensor_samples, + generate_test_file_nested_groups, + generate_test_file_with_attributes, + generate_test_file_empty_datasets, + generate_test_file_string_datasets + ) + + for (test_func in test_functions) { + tryCatch({ + test_func(testdir) + }, error = function(e) { + cat(sprintf(" ✗ Error: %s\n", conditionMessage(e))) + }) + } + + files <- sort(list.files(testdir, pattern = "\\.h5$", full.names = TRUE)) + cat(sprintf("\nGenerated %d HDF5 test files in %s\n", length(files), normalizePath(testdir))) +} + +if (!interactive()) { + main() +} \ No newline at end of file From 771a9649a4a1384ebd97778396712de2c74d9b10 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 1 Dec 2025 16:22:46 +0100 Subject: [PATCH 2/4] HDF5 Reader comprehensive tests --- .../test/functions/io/hdf5/ReadHDF5Test.java | 216 ++++++++++++++++-- .../test/functions/io/hdf5/ReadHDF5Test1.java | 38 --- .../test/functions/io/hdf5/ReadHDF5Test2.java | 38 --- .../test/functions/io/hdf5/ReadHDF5Test3.java | 38 --- ...eadHDF5Test_3.dml => ReadHDF5_Default.dml} | 0 ...DF5Test_2.dml => ReadHDF5_WithDataset.dml} | 0 ....dml => ReadHDF5_WithFormatAndDataset.dml} | 0 .../functions/io/hdf5/gen_HDF5_testdata.R | 46 ++-- 8 files changed, 229 insertions(+), 147 deletions(-) delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_3.dml => ReadHDF5_Default.dml} (100%) rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_2.dml => ReadHDF5_WithDataset.dml} (100%) rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_1.dml => ReadHDF5_WithFormatAndDataset.dml} (100%) diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java index c20294cd85b..c1a7d37b986 100644 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java +++ b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java @@ -19,38 +19,133 @@ package org.apache.sysds.test.functions.io.hdf5; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; +import java.util.List; +import java.util.stream.Collectors; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.FileFormat; +import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import 
org.apache.sysds.runtime.util.HDFSTool; import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; +import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; -public abstract class ReadHDF5Test extends ReadHDF5TestBase { +@RunWith(Parameterized.class) +public class ReadHDF5Test extends ReadHDF5TestBase { - protected abstract int getId(); + private static final double eps = 1e-9; + private static final String TEST_NAME = "ReadHDF5Test"; - protected String getInputHDF5FileName() { - return "transfusion_" + getId() + ".h5"; + private static final int S_2D_ROWS = 200; + private static final int S_2D_COLS = 40; + private static final int S_ARRAY_LENGTH = 30; + private static final int MATRIX_3D_ROWS = 15; + private static final int MATRIX_3D_FLATTENED_COLS = 15 * 5; + private static final int MULTI_TENSOR_SAMPLES = 120; + private static final int MULTI_TENSOR_LABEL_FEATURES = 12; + private static final int MULTI_TENSOR_SEN1_FLATTENED_COLS = 16 * 16 * 4; + + private static final List TEST_CASES = Collections.unmodifiableList(Arrays.asList( + new Hdf5TestCase( + "test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY, MATRIX_3D_ROWS, MATRIX_3D_FLATTENED_COLS), + new Hdf5TestCase( + "test_different_dtypes.h5", "double_primary", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_chunked.h5", "chunked_data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_compressed.h5", "gzip_compressed_9", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_LABEL_FEATURES), + new Hdf5TestCase( + "test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_SEN1_FLATTENED_COLS), + new Hdf5TestCase( + "test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_with_attributes.h5", "data", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_empty_datasets.h5", "empty", DmlVariant.FORMAT_AND_DATASET, 0, S_2D_COLS), + new Hdf5TestCase( + "test_string_datasets.h5", "string_array", DmlVariant.DATASET_ONLY, S_ARRAY_LENGTH, 1) + )); + + private final Hdf5TestCase testCase; + + public ReadHDF5Test(Hdf5TestCase testCase) { + this.testCase = testCase; } - private final static double eps = 1e-9; + @BeforeClass + public static void ensureHdf5DataGenerated() { + Path scriptDir = Paths.get(SCRIPT_DIR, TEST_DIR); + Path inputDir = scriptDir.resolve(INPUT_DIR); + boolean missingFiles = TEST_CASES.stream() + .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); + if(!missingFiles) + ensureMetadataFiles(inputDir); + else { + generateHdf5Data(scriptDir); - @Test - public void testHDF51_Seq_CP() { - runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, false); + boolean stillMissing = TEST_CASES.stream() + .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); + if(stillMissing) + Assert.fail("Failed to generate required HDF5 files for ReadHDF5 tests."); + + ensureMetadataFiles(inputDir); + } + } + + 
@Parameters(name = "{0}") + public static Collection data() { + return TEST_CASES.stream() + .map(tc -> new Object[] {tc}) + .collect(Collectors.toList()); + } + + @Override + protected String getTestName() { + return TEST_NAME; + } + + @Override + protected String getTestClassDir() { + return TEST_CLASS_DIR; } @Test - public void testHDF51_Parallel_CP() { - runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, true); + public void testReadSequential() { + runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, false); } - protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parallel) { + @Test + public void testReadSequentialParallelIO() { + runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, true); + } + protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean parallel) { ExecMode oldPlatform = rtplatform; rtplatform = platform; @@ -61,21 +156,19 @@ protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parall boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT; try { - CompilerConfig.FLAG_PARREADWRITE_TEXT = parallel; TestConfiguration config = getTestConfiguration(getTestName()); loadTestConfiguration(config); String HOME = SCRIPT_DIR + TEST_DIR; - String inputMatrixName = HOME + INPUT_DIR + getInputHDF5FileName(); // always read the same data - String datasetName = "DATASET_1"; + String inputMatrixName = HOME + INPUT_DIR + testCase.hdf5File; - fullDMLScriptName = HOME + getTestName() + "_" + testNumber + ".dml"; - programArgs = new String[] {"-args", inputMatrixName, datasetName, output("Y")}; + fullDMLScriptName = HOME + testCase.variant.getScriptName(); + programArgs = new String[] {"-args", inputMatrixName, testCase.dataset, output("Y")}; fullRScriptName = HOME + "ReadHDF5_Verify.R"; - rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + datasetName + " " + expectedDir(); + rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + expectedDir(); runTest(true, false, null, -1); runRScript(true); @@ -90,4 +183,93 @@ protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parall DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; } } + + private static void generateHdf5Data(Path scriptDir) { + ProcessBuilder processBuilder = new ProcessBuilder("Rscript", "gen_HDF5_testdata.R"); + processBuilder.directory(scriptDir.toFile()); + processBuilder.redirectErrorStream(true); + + try { + Process process = processBuilder.start(); + StringBuilder output = new StringBuilder(); + try(BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { + reader.lines().forEach(line -> output.append(line).append(System.lineSeparator())); + } + int exitCode = process.waitFor(); + if(exitCode != 0) + Assert.fail("Failed to execute gen_HDF5_testdata.R (exit " + exitCode + "):\n" + output); + } + catch(IOException e) { + Assert.fail("Unable to execute gen_HDF5_testdata.R: " + e.getMessage()); + } + catch(InterruptedException e) { + Thread.currentThread().interrupt(); + Assert.fail("Interrupted while generating HDF5 test data."); + } + } + + private static void ensureMetadataFiles(Path inputDir) { + try { + Files.createDirectories(inputDir); + for(Hdf5TestCase tc : TEST_CASES) { + Path mtdPath = inputDir.resolve(tc.getMtdFileName()); + if(Files.exists(mtdPath)) + continue; + + MatrixCharacteristics mc = new MatrixCharacteristics(tc.rows, tc.cols, tc.getNonZeros()); + HDFSTool.writeMetaDataFile(mtdPath.toString(), ValueType.FP64, 
mc, FileFormat.HDF5); + } + } + catch(IOException e) { + Assert.fail("Unable to create HDF5 metadata files: " + e.getMessage()); + } + } + + private enum DmlVariant { + FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), + DATASET_ONLY("ReadHDF5_WithDataset.dml"), + DEFAULT("ReadHDF5_Default.dml"); + + private final String scriptName; + + DmlVariant(String scriptName) { + this.scriptName = scriptName; + } + + public String getScriptName() { + return scriptName; + } + } + + private static final class Hdf5TestCase { + private final String hdf5File; + private final String dataset; + private final DmlVariant variant; + private final long rows; + private final long cols; + + private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant, long rows, long cols) { + this.hdf5File = hdf5File; + this.dataset = dataset; + this.variant = variant; + this.rows = rows; + this.cols = cols; + } + + private String getMtdFileName() { + return hdf5File + ".mtd"; + } + + private long getNonZeros() { + if(rows == 0 || cols == 0) + return 0; + return rows * cols; + } + + @Override + public String toString() { + return hdf5File + "::" + dataset; + } + } } diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java deleted file mode 100644 index b0fff7a6391..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test1 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - public final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test1.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 1; - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java deleted file mode 100644 index d6a4c763c34..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test2 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - private final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test2.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 2; - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java deleted file mode 100644 index 71a6b1762ec..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test3 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - private final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test3.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 3; - } -} diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_3.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_Default.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_3.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_Default.dml diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_2.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_WithDataset.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_2.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_WithDataset.dml diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_1.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_WithFormatAndDataset.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_1.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_WithFormatAndDataset.dml diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index 22e94da1d41..a1f5c284270 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -29,26 +29,25 @@ if (!require("rhdf5", quietly = TRUE)) { } SMALL_MATRIX_2D <- c(200, 40) -LARGE_MATRIX_2D <- c(800, 30) SMALL_MATRIX_3D <- c(15, 15, 5) +SMALL_TENSOR_4D_A <- c(120, 16, 16, 4) +SMALL_TENSOR_4D_B <- c(120, 16, 16, 5) +SMALL_LABEL_MATRIX <- c(120, 12) VECTOR_LENGTH <- 200 STRING_ARRAY_LENGTH <- 30 CHUNK_SHAPE <- c(100, 20) -LARGE_CHUNK_SHAPE <- c(200, 30) - -TENSOR_DIMS <- list( - samples = 120, - height = 16, - width = 16, - channels_a = 4, - channels_b = 5, - label_features = 12 -) write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { values <- generator(prod(shape)) + # Create dataset without compression, filters, or chunking to avoid message type 11 (Filter Pipeline) + # filter = "NONE": explicitly disable compression filters + # level = 0: no compression + # shuffle = FALSE: no shuffle filter + # chunk = dims: single chunk matching dataset size (effectively contiguous for small datasets) + h5createDataset(file_path, dataset_name, dims = shape, + filter = "NONE", level = 0, shuffle = FALSE, chunk = shape) h5write(array(values, dim = shape), file_path, dataset_name) } @@ -63,6 +62,9 @@ generate_test_file_multiple_datasets <- function(dir) { file_path <- file.path(dir, "test_multiple_datasets.h5") h5createFile(file_path) write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) + # Create 1D vector without compression/filters + h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, + filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") @@ -94,7 +96,9 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = 
CHUNK_SHAPE) + # Chunked dataset without compression/filters (chunking is intentional for this test) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, + filter = "NONE", level = 0, shuffle = FALSE) h5write(data, file_path, "chunked_data") write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) @@ -120,17 +124,17 @@ generate_test_file_multi_tensor_samples <- function(dir) { write_matrix( file_path, "sen1", - c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_a) + SMALL_TENSOR_4D_A ) write_matrix( file_path, "sen2", - c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_b) + SMALL_TENSOR_4D_B ) write_matrix( file_path, "label", - c(TENSOR_DIMS$samples, TENSOR_DIMS$label_features), + SMALL_LABEL_MATRIX, generator = function(n) as.integer(sample(0:1, n, replace = TRUE)) ) cat("Created test_multi_tensor_samples.h5 (multi-input tensors)\n") @@ -168,9 +172,15 @@ generate_test_file_with_attributes <- function(dir) { generate_test_file_empty_datasets <- function(dir) { file_path <- file.path(dir, "test_empty_datasets.h5") h5createFile(file_path) - h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2])) + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), + filter = "NONE", level = 0, shuffle = FALSE) + # Create scalar and vector without compression/filters + h5createDataset(file_path, "scalar", dims = 1, + filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) h5write(1.0, file_path, "scalar") + h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, + filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector") cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") } @@ -179,6 +189,10 @@ generate_test_file_string_datasets <- function(dir) { file_path <- file.path(dir, "test_string_datasets.h5") h5createFile(file_path) strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) + # Create string dataset without compression/filters + h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, + storage.mode = "character", filter = "NONE", level = 0, + shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) h5write(strings, file_path, "string_array") cat("Created test_string_datasets.h5 (string datasets)\n") } From f6c833d59ead3d168ded936acde15eeb799c9ce4 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 5 Jan 2026 18:20:06 +0100 Subject: [PATCH 3/4] clean up test --- .../test/functions/io/hdf5/ReadHDF5Test.java | 147 ++++-------------- .../functions/io/hdf5/ReadHDF5_Verify.R | 18 ++- .../functions/io/hdf5/gen_HDF5_testdata.R | 46 +++--- 3 files changed, 70 insertions(+), 141 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java index c1a7d37b986..bd6af2fe066 100644 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java +++ b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java @@ -23,107 +23,37 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; +import java.io.File; +import org.apache.commons.io.FileUtils; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; -import java.util.stream.Collectors; import 
org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; -import org.apache.sysds.common.Types.FileFormat; -import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.matrix.data.MatrixValue; -import org.apache.sysds.runtime.meta.MatrixCharacteristics; -import org.apache.sysds.runtime.util.HDFSTool; import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; -@RunWith(Parameterized.class) public class ReadHDF5Test extends ReadHDF5TestBase { private static final double eps = 1e-9; private static final String TEST_NAME = "ReadHDF5Test"; - private static final int S_2D_ROWS = 200; - private static final int S_2D_COLS = 40; - private static final int S_ARRAY_LENGTH = 30; - private static final int MATRIX_3D_ROWS = 15; - private static final int MATRIX_3D_FLATTENED_COLS = 15 * 5; - private static final int MULTI_TENSOR_SAMPLES = 120; - private static final int MULTI_TENSOR_LABEL_FEATURES = 12; - private static final int MULTI_TENSOR_SEN1_FLATTENED_COLS = 16 * 16 * 4; - - private static final List TEST_CASES = Collections.unmodifiableList(Arrays.asList( - new Hdf5TestCase( - "test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY, MATRIX_3D_ROWS, MATRIX_3D_FLATTENED_COLS), - new Hdf5TestCase( - "test_different_dtypes.h5", "double_primary", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_chunked.h5", "chunked_data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_compressed.h5", "gzip_compressed_9", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_LABEL_FEATURES), - new Hdf5TestCase( - "test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_SEN1_FLATTENED_COLS), - new Hdf5TestCase( - "test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_with_attributes.h5", "data", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_empty_datasets.h5", "empty", DmlVariant.FORMAT_AND_DATASET, 0, S_2D_COLS), - new Hdf5TestCase( - "test_string_datasets.h5", "string_array", DmlVariant.DATASET_ONLY, S_ARRAY_LENGTH, 1) - )); - - private final Hdf5TestCase testCase; - - public ReadHDF5Test(Hdf5TestCase testCase) { - this.testCase = testCase; - } - - @BeforeClass - public static void ensureHdf5DataGenerated() { - Path scriptDir = Paths.get(SCRIPT_DIR, TEST_DIR); - Path inputDir = scriptDir.resolve(INPUT_DIR); - boolean missingFiles = TEST_CASES.stream() - .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); - if(!missingFiles) - ensureMetadataFiles(inputDir); - else { - generateHdf5Data(scriptDir); - - boolean stillMissing = TEST_CASES.stream() - .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); - if(stillMissing) - Assert.fail("Failed to generate required HDF5 files for ReadHDF5 tests."); - - ensureMetadataFiles(inputDir); - } 
- } - - @Parameters(name = "{0}") - public static Collection data() { - return TEST_CASES.stream() - .map(tc -> new Object[] {tc}) - .collect(Collectors.toList()); - } + private static final List TEST_CASES = Collections.unmodifiableList( + Arrays.asList(new Hdf5TestCase("test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET), + new Hdf5TestCase("test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET))); @Override protected String getTestName() { @@ -135,14 +65,22 @@ protected String getTestClassDir() { return TEST_CLASS_DIR; } + @BeforeClass + public static void setUpClass() { + Path scriptDir = Paths.get(SCRIPT_DIR + TEST_DIR); + generateHdf5Data(scriptDir); + } + @Test public void testReadSequential() { - runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, false); + for(Hdf5TestCase tc : TEST_CASES) + runReadHDF5Test(tc, ExecMode.SINGLE_NODE, false); } @Test public void testReadSequentialParallelIO() { - runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, true); + for(Hdf5TestCase tc : TEST_CASES) + runReadHDF5Test(tc, ExecMode.SINGLE_NODE, true); } protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean parallel) { @@ -167,8 +105,17 @@ protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean fullDMLScriptName = HOME + testCase.variant.getScriptName(); programArgs = new String[] {"-args", inputMatrixName, testCase.dataset, output("Y")}; + // Clean per-case output/expected to avoid reusing stale metadata between looped cases + String outY = output("Y"); + String expY = expected("Y"); + FileUtils.deleteQuietly(new File(outY)); + FileUtils.deleteQuietly(new File(outY + ".mtd")); + FileUtils.deleteQuietly(new File(expY)); + FileUtils.deleteQuietly(new File(expY + ".mtd")); + fullRScriptName = HOME + "ReadHDF5_Verify.R"; - rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + expectedDir(); + rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + + expectedDir(); runTest(true, false, null, -1); runRScript(true); @@ -209,26 +156,8 @@ private static void generateHdf5Data(Path scriptDir) { } } - private static void ensureMetadataFiles(Path inputDir) { - try { - Files.createDirectories(inputDir); - for(Hdf5TestCase tc : TEST_CASES) { - Path mtdPath = inputDir.resolve(tc.getMtdFileName()); - if(Files.exists(mtdPath)) - continue; - - MatrixCharacteristics mc = new MatrixCharacteristics(tc.rows, tc.cols, tc.getNonZeros()); - HDFSTool.writeMetaDataFile(mtdPath.toString(), ValueType.FP64, mc, FileFormat.HDF5); - } - } - catch(IOException e) { - Assert.fail("Unable to create HDF5 metadata files: " + e.getMessage()); - } - } - private enum DmlVariant { - FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), - DATASET_ONLY("ReadHDF5_WithDataset.dml"), + FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), DATASET_ONLY("ReadHDF5_WithDataset.dml"), DEFAULT("ReadHDF5_Default.dml"); private final String scriptName; @@ -246,25 +175,11 @@ private static final class Hdf5TestCase { private final String hdf5File; private final String dataset; private final DmlVariant variant; - private final long rows; - 
private final long cols; - private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant, long rows, long cols) { + private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant) { this.hdf5File = hdf5File; this.dataset = dataset; this.variant = variant; - this.rows = rows; - this.cols = cols; - } - - private String getMtdFileName() { - return hdf5File + ".mtd"; - } - - private long getNonZeros() { - if(rows == 0 || cols == 0) - return 0; - return rows * cols; } @Override diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R b/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R index 2b977007dd2..925e092f724 100644 --- a/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R +++ b/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R @@ -26,5 +26,19 @@ options(digits=22) library("rhdf5") -Y = h5read(args[1],args[2],native = TRUE) -writeMM(as(Y, "CsparseMatrix"), paste(args[3], "Y", sep="")) +Y = h5read(args[1], args[2], native = TRUE) +dims = dim(Y) + +if(length(dims) == 1) { + # convert to a column matrix + Y_mat = matrix(Y, ncol = 1) +} else if(length(dims) > 2) { + # flatten everything beyond the first dimension into columns + perm = c(1, rev(seq(2, length(dims)))) + Y_mat = matrix(aperm(Y, perm), nrow = dims[1], ncol = prod(dims[-1])) +} else { + # for 2d , systemds treats it the same + Y_mat = Y +} + +writeMM(as(Y_mat, "CsparseMatrix"), paste(args[3], "Y", sep="")) diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index a1f5c284270..7d4f56c8811 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -41,14 +41,17 @@ CHUNK_SHAPE <- c(100, 20) write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { values <- generator(prod(shape)) - # Create dataset without compression, filters, or chunking to avoid message type 11 (Filter Pipeline) - # filter = "NONE": explicitly disable compression filters - # level = 0: no compression - # shuffle = FALSE: no shuffle filter - # chunk = dims: single chunk matching dataset size (effectively contiguous for small datasets) - h5createDataset(file_path, dataset_name, dims = shape, - filter = "NONE", level = 0, shuffle = FALSE, chunk = shape) - h5write(array(values, dim = shape), file_path, dataset_name) + h5createDataset( + file_path, + dataset_name, + dims = rev(shape), + chunk = NULL, + filter = "NONE", # contiguous, uncompressed layout + level = 0, + shuffle = FALSE, + native = TRUE # use R column-major order, same in h5read(..., native=TRUE) in tests. 
+ ) + h5write(array(values, dim = shape), file_path, dataset_name, native = TRUE) } generate_test_file_single_dataset <- function(dir) { @@ -63,9 +66,8 @@ generate_test_file_multiple_datasets <- function(dir) { h5createFile(file_path) write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) # Create 1D vector without compression/filters - h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, - filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) - h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") + h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, chunk = NULL, filter = "NONE", level = 0, shuffle = FALSE) + h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d", native = TRUE) write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") } @@ -96,10 +98,10 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - # Chunked dataset without compression/filters (chunking is intentional for this test) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, filter = "NONE", level = 0, shuffle = FALSE) - h5write(data, file_path, "chunked_data") + h5write(data, file_path, "chunked_data", native = TRUE) write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) cat("Created test_chunked.h5 (chunked dataset)\n") @@ -111,10 +113,10 @@ generate_test_file_compressed <- function(dir) { data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 9) - h5write(data, file_path, "gzip_compressed_9") + h5write(data, file_path, "gzip_compressed_9", native = TRUE) h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 1) - h5write(data, file_path, "gzip_compressed_1") + h5write(data, file_path, "gzip_compressed_1", native = TRUE) cat("Created test_compressed.h5 (gzip compression)\n") } @@ -175,13 +177,12 @@ generate_test_file_empty_datasets <- function(dir) { h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), filter = "NONE", level = 0, shuffle = FALSE) - # Create scalar and vector without compression/filters h5createDataset(file_path, "scalar", dims = 1, filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) - h5write(1.0, file_path, "scalar") + h5write(1.0, file_path, "scalar", native = TRUE) h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) - h5write(rnorm(VECTOR_LENGTH), file_path, "vector") + h5write(rnorm(VECTOR_LENGTH), file_path, "vector", native = TRUE) cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") } @@ -193,14 +194,13 @@ generate_test_file_string_datasets <- function(dir) { h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, storage.mode = "character", filter = "NONE", level = 0, shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) - h5write(strings, file_path, "string_array") + h5write(strings, file_path, "string_array", native = TRUE) cat("Created test_string_datasets.h5 (string datasets)\n") } main <- function() { - # Check if working directory is "hdf5". Quit if not. 
if (basename(getwd()) != "hdf5") { - cat("You must execute this script from the 'hdf5' directory!\n") + cat("You must execute this script from the 'hdf5' directory\n") quit(status = 1) } From be9247fdcc555dd2db190a40b26c8268ff19089a Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 5 Jan 2026 18:21:10 +0100 Subject: [PATCH 4/4] fix datatype test input --- .../functions/io/hdf5/gen_HDF5_testdata.R | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index 7d4f56c8811..fb9fed140ab 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -39,7 +39,7 @@ STRING_ARRAY_LENGTH <- 30 CHUNK_SHAPE <- c(100, 20) -write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { +write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n), storage.mode = "double", H5type = NULL) { values <- generator(prod(shape)) h5createDataset( file_path, @@ -49,6 +49,8 @@ write_matrix <- function(file_path, dataset_name, shape, generator = function(n) filter = "NONE", # contiguous, uncompressed layout level = 0, shuffle = FALSE, + storage.mode = storage.mode, + H5type = H5type, native = TRUE # use R column-major order, same in h5read(..., native=TRUE) in tests. 
) h5write(array(values, dim = shape), file_path, dataset_name, native = TRUE) @@ -75,21 +77,27 @@ generate_test_file_multiple_datasets <- function(dir) { generate_test_file_different_dtypes <- function(dir) { file_path <- file.path(dir, "test_different_dtypes.h5") h5createFile(file_path) - write_matrix(file_path, "double_primary", SMALL_MATRIX_2D) - write_matrix(file_path, "double_secondary", SMALL_MATRIX_2D) + # H5T_IEEE_F64LE (64-bit float) + write_matrix(file_path, "double_primary", SMALL_MATRIX_2D, storage.mode = "double") + # H5T_IEEE_F32LE (32-bit float) + write_matrix(file_path, "float32", SMALL_MATRIX_2D, H5type = "H5T_IEEE_F32LE") + # H5T_STD_I32LE (32-bit integer) write_matrix( file_path, "int32", SMALL_MATRIX_2D, - generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)), + storage.mode = "integer" ) + # H5T_STD_I64LE (64-bit integer) write_matrix( file_path, - "int32_alt", + "int64", SMALL_MATRIX_2D, - generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)), + H5type = "H5T_STD_I64LE" ) - cat("Created test_different_dtypes.h5 (double/int datasets)\n") + cat("Created test_different_dtypes.h5 (double/float/int32/int64 datasets)\n") } # https://support.hdfgroup.org/documentation/hdf5-docs/advanced_topics/chunking_in_hdf5.html @@ -98,7 +106,7 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, filter = "NONE", level = 0, shuffle = FALSE) h5write(data, file_path, "chunked_data", native = TRUE) @@ -111,10 +119,10 @@ generate_test_file_compressed <- function(dir) { file_path <- file.path(dir, "test_compressed.h5") h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, + h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 9) h5write(data, file_path, "gzip_compressed_9", native = TRUE) - h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, + h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 1) h5write(data, file_path, "gzip_compressed_1", native = TRUE) cat("Created test_compressed.h5 (gzip compression)\n") @@ -174,13 +182,13 @@ generate_test_file_with_attributes <- function(dir) { generate_test_file_empty_datasets <- function(dir) { file_path <- file.path(dir, "test_empty_datasets.h5") h5createFile(file_path) - h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), filter = "NONE", level = 0, shuffle = FALSE) - h5createDataset(file_path, "scalar", dims = 1, + h5createDataset(file_path, "scalar", dims = 1, filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) h5write(1.0, file_path, "scalar", native = TRUE) - h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, + h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector", native = TRUE) cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") @@ -191,8 +199,8 @@ generate_test_file_string_datasets <- function(dir) { h5createFile(file_path) 
strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) # Create string dataset without compression/filters - h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, - storage.mode = "character", filter = "NONE", level = 0, + h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, + storage.mode = "character", filter = "NONE", level = 0, shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) h5write(strings, file_path, "string_array", native = TRUE) cat("Created test_string_datasets.h5 (string datasets)\n") @@ -203,12 +211,12 @@ main <- function() { cat("You must execute this script from the 'hdf5' directory\n") quit(status = 1) } - + testdir <- "in" if (!dir.exists(testdir)) { dir.create(testdir) } - + test_functions <- list( generate_test_file_single_dataset, generate_test_file_multiple_datasets, @@ -221,7 +229,7 @@ main <- function() { generate_test_file_empty_datasets, generate_test_file_string_datasets ) - + for (test_func in test_functions) { tryCatch({ test_func(testdir) @@ -229,11 +237,11 @@ main <- function() { cat(sprintf(" ✗ Error: %s\n", conditionMessage(e))) }) } - + files <- sort(list.files(testdir, pattern = "\\.h5$", full.names = TRUE)) cat(sprintf("\nGenerated %d HDF5 test files in %s\n", length(files), normalizePath(testdir))) } if (!interactive()) { main() -} \ No newline at end of file +}