From 9b69e5b1c95f42a20d88388a0d093d17ce1bcfe6 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 1 Dec 2025 13:26:02 +0100 Subject: [PATCH 1/4] test data generator for HDF5 Readers --- .../functions/io/hdf5/gen_HDF5_testdata.R | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R new file mode 100644 index 00000000000..22e94da1d41 --- /dev/null +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -0,0 +1,225 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +# Generate various HDF5 test files with different formats. +# Creates test files in the 'in' directory. + +if (!require("rhdf5", quietly = TRUE)) { + cat("Error: rhdf5 is not installed.\n") + quit(status = 1) +} + +SMALL_MATRIX_2D <- c(200, 40) +LARGE_MATRIX_2D <- c(800, 30) +SMALL_MATRIX_3D <- c(15, 15, 5) + +VECTOR_LENGTH <- 200 +STRING_ARRAY_LENGTH <- 30 + +CHUNK_SHAPE <- c(100, 20) +LARGE_CHUNK_SHAPE <- c(200, 30) + +TENSOR_DIMS <- list( + samples = 120, + height = 16, + width = 16, + channels_a = 4, + channels_b = 5, + label_features = 12 +) + +write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { + values <- generator(prod(shape)) + h5write(array(values, dim = shape), file_path, dataset_name) +} + +generate_test_file_single_dataset <- function(dir) { + file_path <- file.path(dir, "test_single_dataset.h5") + h5createFile(file_path) + write_matrix(file_path, "data", SMALL_MATRIX_2D) + cat("Created test_single_dataset.h5 (single 2D dataset)\n") +} + +generate_test_file_multiple_datasets <- function(dir) { + file_path <- file.path(dir, "test_multiple_datasets.h5") + h5createFile(file_path) + write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) + h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") + write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) + cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") +} + +generate_test_file_different_dtypes <- function(dir) { + file_path <- file.path(dir, "test_different_dtypes.h5") + h5createFile(file_path) + write_matrix(file_path, "double_primary", SMALL_MATRIX_2D) + write_matrix(file_path, "double_secondary", SMALL_MATRIX_2D) + write_matrix( + file_path, + "int32", + SMALL_MATRIX_2D, + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + ) + write_matrix( + file_path, + "int32_alt", + SMALL_MATRIX_2D, + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + ) + cat("Created test_different_dtypes.h5 (double/int 
datasets)\n") +} + +# https://support.hdfgroup.org/documentation/hdf5-docs/advanced_topics/chunking_in_hdf5.html +generate_test_file_chunked <- function(dir) { + file_path <- file.path(dir, "test_chunked.h5") + h5createFile(file_path) + + data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE) + h5write(data, file_path, "chunked_data") + + write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) + cat("Created test_chunked.h5 (chunked dataset)\n") +} + +generate_test_file_compressed <- function(dir) { + file_path <- file.path(dir, "test_compressed.h5") + h5createFile(file_path) + data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) + h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, + chunk = SMALL_MATRIX_2D, level = 9) + h5write(data, file_path, "gzip_compressed_9") + h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, + chunk = SMALL_MATRIX_2D, level = 1) + h5write(data, file_path, "gzip_compressed_1") + cat("Created test_compressed.h5 (gzip compression)\n") +} + +generate_test_file_multi_tensor_samples <- function(dir) { + file_path <- file.path(dir, "test_multi_tensor_samples.h5") + h5createFile(file_path) + write_matrix( + file_path, + "sen1", + c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_a) + ) + write_matrix( + file_path, + "sen2", + c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_b) + ) + write_matrix( + file_path, + "label", + c(TENSOR_DIMS$samples, TENSOR_DIMS$label_features), + generator = function(n) as.integer(sample(0:1, n, replace = TRUE)) + ) + cat("Created test_multi_tensor_samples.h5 (multi-input tensors)\n") +} + +generate_test_file_nested_groups <- function(dir) { + file_path <- file.path(dir, "test_nested_groups.h5") + h5createFile(file_path) + write_matrix(file_path, "root_data", SMALL_MATRIX_2D) + h5createGroup(file_path, "group1") + write_matrix(file_path, "group1/data1", SMALL_MATRIX_2D) + h5createGroup(file_path, "group1/subgroup") + write_matrix(file_path, "group1/subgroup/data2", SMALL_MATRIX_2D) + cat("Created test_nested_groups.h5 (nested group hierarchy)\n") +} + +generate_test_file_with_attributes <- function(dir) { + file_path <- file.path(dir, "test_with_attributes.h5") + h5createFile(file_path) + write_matrix(file_path, "data", SMALL_MATRIX_2D) + + fid <- H5Fopen(file_path) + did <- H5Dopen(fid, "data") + h5writeAttribute("Test dataset with attributes", did, "description") + h5writeAttribute(1.0, did, "version") + h5writeAttribute(SMALL_MATRIX_2D, did, "shape") + H5Dclose(did) + + h5writeAttribute("2025-11-26", fid, "file_created") + h5writeAttribute("attributes", fid, "test_type") + H5Fclose(fid) + cat("Created test_with_attributes.h5 (dataset + file attributes)\n") +} + +generate_test_file_empty_datasets <- function(dir) { + file_path <- file.path(dir, "test_empty_datasets.h5") + h5createFile(file_path) + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2])) + + h5write(1.0, file_path, "scalar") + h5write(rnorm(VECTOR_LENGTH), file_path, "vector") + cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") +} + +generate_test_file_string_datasets <- function(dir) { + file_path <- file.path(dir, "test_string_datasets.h5") + h5createFile(file_path) + strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) + h5write(strings, file_path, "string_array") + cat("Created test_string_datasets.h5 (string 
datasets)\n") +} + +main <- function() { + # Check if working directory is "hdf5". Quit if not. + if (basename(getwd()) != "hdf5") { + cat("You must execute this script from the 'hdf5' directory!\n") + quit(status = 1) + } + + testdir <- "in" + if (!dir.exists(testdir)) { + dir.create(testdir) + } + + test_functions <- list( + generate_test_file_single_dataset, + generate_test_file_multiple_datasets, + generate_test_file_different_dtypes, + generate_test_file_chunked, + generate_test_file_compressed, + generate_test_file_multi_tensor_samples, + generate_test_file_nested_groups, + generate_test_file_with_attributes, + generate_test_file_empty_datasets, + generate_test_file_string_datasets + ) + + for (test_func in test_functions) { + tryCatch({ + test_func(testdir) + }, error = function(e) { + cat(sprintf(" ✗ Error: %s\n", conditionMessage(e))) + }) + } + + files <- sort(list.files(testdir, pattern = "\\.h5$", full.names = TRUE)) + cat(sprintf("\nGenerated %d HDF5 test files in %s\n", length(files), normalizePath(testdir))) +} + +if (!interactive()) { + main() +} \ No newline at end of file From 771a9649a4a1384ebd97778396712de2c74d9b10 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 1 Dec 2025 16:22:46 +0100 Subject: [PATCH 2/4] HDF5 Reader comprehensive tests --- .../test/functions/io/hdf5/ReadHDF5Test.java | 216 ++++++++++++++++-- .../test/functions/io/hdf5/ReadHDF5Test1.java | 38 --- .../test/functions/io/hdf5/ReadHDF5Test2.java | 38 --- .../test/functions/io/hdf5/ReadHDF5Test3.java | 38 --- ...eadHDF5Test_3.dml => ReadHDF5_Default.dml} | 0 ...DF5Test_2.dml => ReadHDF5_WithDataset.dml} | 0 ....dml => ReadHDF5_WithFormatAndDataset.dml} | 0 .../functions/io/hdf5/gen_HDF5_testdata.R | 46 ++-- 8 files changed, 229 insertions(+), 147 deletions(-) delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_3.dml => ReadHDF5_Default.dml} (100%) rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_2.dml => ReadHDF5_WithDataset.dml} (100%) rename src/test/scripts/functions/io/hdf5/{ReadHDF5Test_1.dml => ReadHDF5_WithFormatAndDataset.dml} (100%) diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java index c20294cd85b..c1a7d37b986 100644 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java +++ b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java @@ -19,38 +19,133 @@ package org.apache.sysds.test.functions.io.hdf5; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; +import java.util.List; +import java.util.stream.Collectors; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; +import org.apache.sysds.common.Types.FileFormat; +import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.matrix.data.MatrixValue; +import org.apache.sysds.runtime.meta.MatrixCharacteristics; +import 
org.apache.sysds.runtime.util.HDFSTool; import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; +import org.junit.Assert; +import org.junit.BeforeClass; import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; -public abstract class ReadHDF5Test extends ReadHDF5TestBase { +@RunWith(Parameterized.class) +public class ReadHDF5Test extends ReadHDF5TestBase { - protected abstract int getId(); + private static final double eps = 1e-9; + private static final String TEST_NAME = "ReadHDF5Test"; - protected String getInputHDF5FileName() { - return "transfusion_" + getId() + ".h5"; + private static final int S_2D_ROWS = 200; + private static final int S_2D_COLS = 40; + private static final int S_ARRAY_LENGTH = 30; + private static final int MATRIX_3D_ROWS = 15; + private static final int MATRIX_3D_FLATTENED_COLS = 15 * 5; + private static final int MULTI_TENSOR_SAMPLES = 120; + private static final int MULTI_TENSOR_LABEL_FEATURES = 12; + private static final int MULTI_TENSOR_SEN1_FLATTENED_COLS = 16 * 16 * 4; + + private static final List TEST_CASES = Collections.unmodifiableList(Arrays.asList( + new Hdf5TestCase( + "test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY, MATRIX_3D_ROWS, MATRIX_3D_FLATTENED_COLS), + new Hdf5TestCase( + "test_different_dtypes.h5", "double_primary", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_chunked.h5", "chunked_data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_compressed.h5", "gzip_compressed_9", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_LABEL_FEATURES), + new Hdf5TestCase( + "test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_SEN1_FLATTENED_COLS), + new Hdf5TestCase( + "test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_with_attributes.h5", "data", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), + new Hdf5TestCase( + "test_empty_datasets.h5", "empty", DmlVariant.FORMAT_AND_DATASET, 0, S_2D_COLS), + new Hdf5TestCase( + "test_string_datasets.h5", "string_array", DmlVariant.DATASET_ONLY, S_ARRAY_LENGTH, 1) + )); + + private final Hdf5TestCase testCase; + + public ReadHDF5Test(Hdf5TestCase testCase) { + this.testCase = testCase; } - private final static double eps = 1e-9; + @BeforeClass + public static void ensureHdf5DataGenerated() { + Path scriptDir = Paths.get(SCRIPT_DIR, TEST_DIR); + Path inputDir = scriptDir.resolve(INPUT_DIR); + boolean missingFiles = TEST_CASES.stream() + .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); + if(!missingFiles) + ensureMetadataFiles(inputDir); + else { + generateHdf5Data(scriptDir); - @Test - public void testHDF51_Seq_CP() { - runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, false); + boolean stillMissing = TEST_CASES.stream() + .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); + if(stillMissing) + Assert.fail("Failed to generate required HDF5 files for ReadHDF5 tests."); + + ensureMetadataFiles(inputDir); + } + } + + 
@Parameters(name = "{0}") + public static Collection data() { + return TEST_CASES.stream() + .map(tc -> new Object[] {tc}) + .collect(Collectors.toList()); + } + + @Override + protected String getTestName() { + return TEST_NAME; + } + + @Override + protected String getTestClassDir() { + return TEST_CLASS_DIR; } @Test - public void testHDF51_Parallel_CP() { - runReadHDF5Test(getId(), ExecMode.SINGLE_NODE, true); + public void testReadSequential() { + runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, false); } - protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parallel) { + @Test + public void testReadSequentialParallelIO() { + runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, true); + } + protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean parallel) { ExecMode oldPlatform = rtplatform; rtplatform = platform; @@ -61,21 +156,19 @@ protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parall boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT; try { - CompilerConfig.FLAG_PARREADWRITE_TEXT = parallel; TestConfiguration config = getTestConfiguration(getTestName()); loadTestConfiguration(config); String HOME = SCRIPT_DIR + TEST_DIR; - String inputMatrixName = HOME + INPUT_DIR + getInputHDF5FileName(); // always read the same data - String datasetName = "DATASET_1"; + String inputMatrixName = HOME + INPUT_DIR + testCase.hdf5File; - fullDMLScriptName = HOME + getTestName() + "_" + testNumber + ".dml"; - programArgs = new String[] {"-args", inputMatrixName, datasetName, output("Y")}; + fullDMLScriptName = HOME + testCase.variant.getScriptName(); + programArgs = new String[] {"-args", inputMatrixName, testCase.dataset, output("Y")}; fullRScriptName = HOME + "ReadHDF5_Verify.R"; - rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + datasetName + " " + expectedDir(); + rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + expectedDir(); runTest(true, false, null, -1); runRScript(true); @@ -90,4 +183,93 @@ protected void runReadHDF5Test(int testNumber, ExecMode platform, boolean parall DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; } } + + private static void generateHdf5Data(Path scriptDir) { + ProcessBuilder processBuilder = new ProcessBuilder("Rscript", "gen_HDF5_testdata.R"); + processBuilder.directory(scriptDir.toFile()); + processBuilder.redirectErrorStream(true); + + try { + Process process = processBuilder.start(); + StringBuilder output = new StringBuilder(); + try(BufferedReader reader = new BufferedReader( + new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) { + reader.lines().forEach(line -> output.append(line).append(System.lineSeparator())); + } + int exitCode = process.waitFor(); + if(exitCode != 0) + Assert.fail("Failed to execute gen_HDF5_testdata.R (exit " + exitCode + "):\n" + output); + } + catch(IOException e) { + Assert.fail("Unable to execute gen_HDF5_testdata.R: " + e.getMessage()); + } + catch(InterruptedException e) { + Thread.currentThread().interrupt(); + Assert.fail("Interrupted while generating HDF5 test data."); + } + } + + private static void ensureMetadataFiles(Path inputDir) { + try { + Files.createDirectories(inputDir); + for(Hdf5TestCase tc : TEST_CASES) { + Path mtdPath = inputDir.resolve(tc.getMtdFileName()); + if(Files.exists(mtdPath)) + continue; + + MatrixCharacteristics mc = new MatrixCharacteristics(tc.rows, tc.cols, tc.getNonZeros()); + HDFSTool.writeMetaDataFile(mtdPath.toString(), ValueType.FP64, 
mc, FileFormat.HDF5); + } + } + catch(IOException e) { + Assert.fail("Unable to create HDF5 metadata files: " + e.getMessage()); + } + } + + private enum DmlVariant { + FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), + DATASET_ONLY("ReadHDF5_WithDataset.dml"), + DEFAULT("ReadHDF5_Default.dml"); + + private final String scriptName; + + DmlVariant(String scriptName) { + this.scriptName = scriptName; + } + + public String getScriptName() { + return scriptName; + } + } + + private static final class Hdf5TestCase { + private final String hdf5File; + private final String dataset; + private final DmlVariant variant; + private final long rows; + private final long cols; + + private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant, long rows, long cols) { + this.hdf5File = hdf5File; + this.dataset = dataset; + this.variant = variant; + this.rows = rows; + this.cols = cols; + } + + private String getMtdFileName() { + return hdf5File + ".mtd"; + } + + private long getNonZeros() { + if(rows == 0 || cols == 0) + return 0; + return rows * cols; + } + + @Override + public String toString() { + return hdf5File + "::" + dataset; + } + } } diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java deleted file mode 100644 index b0fff7a6391..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test1.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test1 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - public final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test1.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 1; - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java deleted file mode 100644 index d6a4c763c34..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test2.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test2 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - private final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test2.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 2; - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java deleted file mode 100644 index 71a6b1762ec..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test3.java +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.io.hdf5; - -public class ReadHDF5Test3 extends ReadHDF5Test { - - private final static String TEST_NAME = "ReadHDF5Test"; - private final static String TEST_CLASS_DIR = TEST_DIR + ReadHDF5Test3.class.getSimpleName() + "/"; - - protected String getTestName() { - return TEST_NAME; - } - - protected String getTestClassDir() { - return TEST_CLASS_DIR; - } - - protected int getId() { - return 3; - } -} diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_3.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_Default.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_3.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_Default.dml diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_2.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_WithDataset.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_2.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_WithDataset.dml diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5Test_1.dml b/src/test/scripts/functions/io/hdf5/ReadHDF5_WithFormatAndDataset.dml similarity index 100% rename from src/test/scripts/functions/io/hdf5/ReadHDF5Test_1.dml rename to src/test/scripts/functions/io/hdf5/ReadHDF5_WithFormatAndDataset.dml diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index 22e94da1d41..a1f5c284270 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -29,26 +29,25 @@ if (!require("rhdf5", quietly = TRUE)) { } SMALL_MATRIX_2D <- c(200, 40) -LARGE_MATRIX_2D <- c(800, 30) SMALL_MATRIX_3D <- c(15, 15, 5) +SMALL_TENSOR_4D_A <- c(120, 16, 16, 4) +SMALL_TENSOR_4D_B <- c(120, 16, 16, 5) +SMALL_LABEL_MATRIX <- c(120, 12) VECTOR_LENGTH <- 200 STRING_ARRAY_LENGTH <- 30 CHUNK_SHAPE <- c(100, 20) -LARGE_CHUNK_SHAPE <- c(200, 30) - -TENSOR_DIMS <- list( - samples = 120, - height = 16, - width = 16, - channels_a = 4, - channels_b = 5, - label_features = 12 -) write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { values <- generator(prod(shape)) + # Create dataset without compression, filters, or chunking to avoid message type 11 (Filter Pipeline) + # filter = "NONE": explicitly disable compression filters + # level = 0: no compression + # shuffle = FALSE: no shuffle filter + # chunk = dims: single chunk matching dataset size (effectively contiguous for small datasets) + h5createDataset(file_path, dataset_name, dims = shape, + filter = "NONE", level = 0, shuffle = FALSE, chunk = shape) h5write(array(values, dim = shape), file_path, dataset_name) } @@ -63,6 +62,9 @@ generate_test_file_multiple_datasets <- function(dir) { file_path <- file.path(dir, "test_multiple_datasets.h5") h5createFile(file_path) write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) + # Create 1D vector without compression/filters + h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, + filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") @@ -94,7 +96,9 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = 
CHUNK_SHAPE) + # Chunked dataset without compression/filters (chunking is intentional for this test) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, + filter = "NONE", level = 0, shuffle = FALSE) h5write(data, file_path, "chunked_data") write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) @@ -120,17 +124,17 @@ generate_test_file_multi_tensor_samples <- function(dir) { write_matrix( file_path, "sen1", - c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_a) + SMALL_TENSOR_4D_A ) write_matrix( file_path, "sen2", - c(TENSOR_DIMS$samples, TENSOR_DIMS$height, TENSOR_DIMS$width, TENSOR_DIMS$channels_b) + SMALL_TENSOR_4D_B ) write_matrix( file_path, "label", - c(TENSOR_DIMS$samples, TENSOR_DIMS$label_features), + SMALL_LABEL_MATRIX, generator = function(n) as.integer(sample(0:1, n, replace = TRUE)) ) cat("Created test_multi_tensor_samples.h5 (multi-input tensors)\n") @@ -168,9 +172,15 @@ generate_test_file_with_attributes <- function(dir) { generate_test_file_empty_datasets <- function(dir) { file_path <- file.path(dir, "test_empty_datasets.h5") h5createFile(file_path) - h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2])) + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), + filter = "NONE", level = 0, shuffle = FALSE) + # Create scalar and vector without compression/filters + h5createDataset(file_path, "scalar", dims = 1, + filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) h5write(1.0, file_path, "scalar") + h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, + filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector") cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") } @@ -179,6 +189,10 @@ generate_test_file_string_datasets <- function(dir) { file_path <- file.path(dir, "test_string_datasets.h5") h5createFile(file_path) strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) + # Create string dataset without compression/filters + h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, + storage.mode = "character", filter = "NONE", level = 0, + shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) h5write(strings, file_path, "string_array") cat("Created test_string_datasets.h5 (string datasets)\n") } From f6c833d59ead3d168ded936acde15eeb799c9ce4 Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 5 Jan 2026 18:20:06 +0100 Subject: [PATCH 3/4] clean up test --- .../test/functions/io/hdf5/ReadHDF5Test.java | 147 ++++-------------- .../functions/io/hdf5/ReadHDF5_Verify.R | 18 ++- .../functions/io/hdf5/gen_HDF5_testdata.R | 46 +++--- 3 files changed, 70 insertions(+), 141 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java index c1a7d37b986..bd6af2fe066 100644 --- a/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java +++ b/src/test/java/org/apache/sysds/test/functions/io/hdf5/ReadHDF5Test.java @@ -23,107 +23,37 @@ import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; +import java.io.File; +import org.apache.commons.io.FileUtils; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; -import java.util.stream.Collectors; import 
org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; -import org.apache.sysds.common.Types.FileFormat; -import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.matrix.data.MatrixValue; -import org.apache.sysds.runtime.meta.MatrixCharacteristics; -import org.apache.sysds.runtime.util.HDFSTool; import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; -@RunWith(Parameterized.class) public class ReadHDF5Test extends ReadHDF5TestBase { private static final double eps = 1e-9; private static final String TEST_NAME = "ReadHDF5Test"; - private static final int S_2D_ROWS = 200; - private static final int S_2D_COLS = 40; - private static final int S_ARRAY_LENGTH = 30; - private static final int MATRIX_3D_ROWS = 15; - private static final int MATRIX_3D_FLATTENED_COLS = 15 * 5; - private static final int MULTI_TENSOR_SAMPLES = 120; - private static final int MULTI_TENSOR_LABEL_FEATURES = 12; - private static final int MULTI_TENSOR_SEN1_FLATTENED_COLS = 16 * 16 * 4; - - private static final List TEST_CASES = Collections.unmodifiableList(Arrays.asList( - new Hdf5TestCase( - "test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY, MATRIX_3D_ROWS, MATRIX_3D_FLATTENED_COLS), - new Hdf5TestCase( - "test_different_dtypes.h5", "double_primary", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_chunked.h5", "chunked_data", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_compressed.h5", "gzip_compressed_9", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_LABEL_FEATURES), - new Hdf5TestCase( - "test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY, MULTI_TENSOR_SAMPLES, MULTI_TENSOR_SEN1_FLATTENED_COLS), - new Hdf5TestCase( - "test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_with_attributes.h5", "data", DmlVariant.DATASET_ONLY, S_2D_ROWS, S_2D_COLS), - new Hdf5TestCase( - "test_empty_datasets.h5", "empty", DmlVariant.FORMAT_AND_DATASET, 0, S_2D_COLS), - new Hdf5TestCase( - "test_string_datasets.h5", "string_array", DmlVariant.DATASET_ONLY, S_ARRAY_LENGTH, 1) - )); - - private final Hdf5TestCase testCase; - - public ReadHDF5Test(Hdf5TestCase testCase) { - this.testCase = testCase; - } - - @BeforeClass - public static void ensureHdf5DataGenerated() { - Path scriptDir = Paths.get(SCRIPT_DIR, TEST_DIR); - Path inputDir = scriptDir.resolve(INPUT_DIR); - boolean missingFiles = TEST_CASES.stream() - .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); - if(!missingFiles) - ensureMetadataFiles(inputDir); - else { - generateHdf5Data(scriptDir); - - boolean stillMissing = TEST_CASES.stream() - .anyMatch(tc -> Files.notExists(inputDir.resolve(tc.hdf5File))); - if(stillMissing) - Assert.fail("Failed to generate required HDF5 files for ReadHDF5 tests."); - - ensureMetadataFiles(inputDir); - } 
- } - - @Parameters(name = "{0}") - public static Collection data() { - return TEST_CASES.stream() - .map(tc -> new Object[] {tc}) - .collect(Collectors.toList()); - } + private static final List TEST_CASES = Collections.unmodifiableList( + Arrays.asList(new Hdf5TestCase("test_single_dataset.h5", "data", DmlVariant.FORMAT_AND_DATASET), + new Hdf5TestCase("test_multiple_datasets.h5", "matrix_2d", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multiple_datasets.h5", "matrix_3d", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multi_tensor_samples.h5", "label", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_multi_tensor_samples.h5", "sen1", DmlVariant.DATASET_ONLY), + new Hdf5TestCase("test_nested_groups.h5", "group1/subgroup/data2", DmlVariant.FORMAT_AND_DATASET))); @Override protected String getTestName() { @@ -135,14 +65,22 @@ protected String getTestClassDir() { return TEST_CLASS_DIR; } + @BeforeClass + public static void setUpClass() { + Path scriptDir = Paths.get(SCRIPT_DIR + TEST_DIR); + generateHdf5Data(scriptDir); + } + @Test public void testReadSequential() { - runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, false); + for(Hdf5TestCase tc : TEST_CASES) + runReadHDF5Test(tc, ExecMode.SINGLE_NODE, false); } @Test public void testReadSequentialParallelIO() { - runReadHDF5Test(testCase, ExecMode.SINGLE_NODE, true); + for(Hdf5TestCase tc : TEST_CASES) + runReadHDF5Test(tc, ExecMode.SINGLE_NODE, true); } protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean parallel) { @@ -167,8 +105,17 @@ protected void runReadHDF5Test(Hdf5TestCase testCase, ExecMode platform, boolean fullDMLScriptName = HOME + testCase.variant.getScriptName(); programArgs = new String[] {"-args", inputMatrixName, testCase.dataset, output("Y")}; + // Clean per-case output/expected to avoid reusing stale metadata between looped cases + String outY = output("Y"); + String expY = expected("Y"); + FileUtils.deleteQuietly(new File(outY)); + FileUtils.deleteQuietly(new File(outY + ".mtd")); + FileUtils.deleteQuietly(new File(expY)); + FileUtils.deleteQuietly(new File(expY + ".mtd")); + fullRScriptName = HOME + "ReadHDF5_Verify.R"; - rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + expectedDir(); + rCmd = "Rscript" + " " + fullRScriptName + " " + inputMatrixName + " " + testCase.dataset + " " + + expectedDir(); runTest(true, false, null, -1); runRScript(true); @@ -209,26 +156,8 @@ private static void generateHdf5Data(Path scriptDir) { } } - private static void ensureMetadataFiles(Path inputDir) { - try { - Files.createDirectories(inputDir); - for(Hdf5TestCase tc : TEST_CASES) { - Path mtdPath = inputDir.resolve(tc.getMtdFileName()); - if(Files.exists(mtdPath)) - continue; - - MatrixCharacteristics mc = new MatrixCharacteristics(tc.rows, tc.cols, tc.getNonZeros()); - HDFSTool.writeMetaDataFile(mtdPath.toString(), ValueType.FP64, mc, FileFormat.HDF5); - } - } - catch(IOException e) { - Assert.fail("Unable to create HDF5 metadata files: " + e.getMessage()); - } - } - private enum DmlVariant { - FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), - DATASET_ONLY("ReadHDF5_WithDataset.dml"), + FORMAT_AND_DATASET("ReadHDF5_WithFormatAndDataset.dml"), DATASET_ONLY("ReadHDF5_WithDataset.dml"), DEFAULT("ReadHDF5_Default.dml"); private final String scriptName; @@ -246,25 +175,11 @@ private static final class Hdf5TestCase { private final String hdf5File; private final String dataset; private final DmlVariant variant; - private final long rows; - 
private final long cols; - private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant, long rows, long cols) { + private Hdf5TestCase(String hdf5File, String dataset, DmlVariant variant) { this.hdf5File = hdf5File; this.dataset = dataset; this.variant = variant; - this.rows = rows; - this.cols = cols; - } - - private String getMtdFileName() { - return hdf5File + ".mtd"; - } - - private long getNonZeros() { - if(rows == 0 || cols == 0) - return 0; - return rows * cols; } @Override diff --git a/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R b/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R index 2b977007dd2..925e092f724 100644 --- a/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R +++ b/src/test/scripts/functions/io/hdf5/ReadHDF5_Verify.R @@ -26,5 +26,19 @@ options(digits=22) library("rhdf5") -Y = h5read(args[1],args[2],native = TRUE) -writeMM(as(Y, "CsparseMatrix"), paste(args[3], "Y", sep="")) +Y = h5read(args[1], args[2], native = TRUE) +dims = dim(Y) + +if(length(dims) == 1) { + # convert to a column matrix + Y_mat = matrix(Y, ncol = 1) +} else if(length(dims) > 2) { + # flatten everything beyond the first dimension into columns + perm = c(1, rev(seq(2, length(dims)))) + Y_mat = matrix(aperm(Y, perm), nrow = dims[1], ncol = prod(dims[-1])) +} else { + # for 2d , systemds treats it the same + Y_mat = Y +} + +writeMM(as(Y_mat, "CsparseMatrix"), paste(args[3], "Y", sep="")) diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index a1f5c284270..7d4f56c8811 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -41,14 +41,17 @@ CHUNK_SHAPE <- c(100, 20) write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { values <- generator(prod(shape)) - # Create dataset without compression, filters, or chunking to avoid message type 11 (Filter Pipeline) - # filter = "NONE": explicitly disable compression filters - # level = 0: no compression - # shuffle = FALSE: no shuffle filter - # chunk = dims: single chunk matching dataset size (effectively contiguous for small datasets) - h5createDataset(file_path, dataset_name, dims = shape, - filter = "NONE", level = 0, shuffle = FALSE, chunk = shape) - h5write(array(values, dim = shape), file_path, dataset_name) + h5createDataset( + file_path, + dataset_name, + dims = rev(shape), + chunk = NULL, + filter = "NONE", # contiguous, uncompressed layout + level = 0, + shuffle = FALSE, + native = TRUE # use R column-major order, same in h5read(..., native=TRUE) in tests. 
+ ) + h5write(array(values, dim = shape), file_path, dataset_name, native = TRUE) } generate_test_file_single_dataset <- function(dir) { @@ -63,9 +66,8 @@ generate_test_file_multiple_datasets <- function(dir) { h5createFile(file_path) write_matrix(file_path, "matrix_2d", SMALL_MATRIX_2D) # Create 1D vector without compression/filters - h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, - filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) - h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d") + h5createDataset(file_path, "vector_1d", dims = VECTOR_LENGTH, chunk = NULL, filter = "NONE", level = 0, shuffle = FALSE) + h5write(rnorm(VECTOR_LENGTH), file_path, "vector_1d", native = TRUE) write_matrix(file_path, "matrix_3d", SMALL_MATRIX_3D) cat("Created test_multiple_datasets.h5 (1D/2D/3D datasets)\n") } @@ -96,10 +98,10 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - # Chunked dataset without compression/filters (chunking is intentional for this test) + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, filter = "NONE", level = 0, shuffle = FALSE) - h5write(data, file_path, "chunked_data") + h5write(data, file_path, "chunked_data", native = TRUE) write_matrix(file_path, "non_chunked_data", SMALL_MATRIX_2D) cat("Created test_chunked.h5 (chunked dataset)\n") @@ -111,10 +113,10 @@ generate_test_file_compressed <- function(dir) { data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 9) - h5write(data, file_path, "gzip_compressed_9") + h5write(data, file_path, "gzip_compressed_9", native = TRUE) h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 1) - h5write(data, file_path, "gzip_compressed_1") + h5write(data, file_path, "gzip_compressed_1", native = TRUE) cat("Created test_compressed.h5 (gzip compression)\n") } @@ -175,13 +177,12 @@ generate_test_file_empty_datasets <- function(dir) { h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), filter = "NONE", level = 0, shuffle = FALSE) - # Create scalar and vector without compression/filters h5createDataset(file_path, "scalar", dims = 1, filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) - h5write(1.0, file_path, "scalar") + h5write(1.0, file_path, "scalar", native = TRUE) h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) - h5write(rnorm(VECTOR_LENGTH), file_path, "vector") + h5write(rnorm(VECTOR_LENGTH), file_path, "vector", native = TRUE) cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") } @@ -193,14 +194,13 @@ generate_test_file_string_datasets <- function(dir) { h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, storage.mode = "character", filter = "NONE", level = 0, shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) - h5write(strings, file_path, "string_array") + h5write(strings, file_path, "string_array", native = TRUE) cat("Created test_string_datasets.h5 (string datasets)\n") } main <- function() { - # Check if working directory is "hdf5". Quit if not. 
if (basename(getwd()) != "hdf5") { - cat("You must execute this script from the 'hdf5' directory!\n") + cat("You must execute this script from the 'hdf5' directory\n") quit(status = 1) } From be9247fdcc555dd2db190a40b26c8268ff19089a Mon Sep 17 00:00:00 2001 From: Lucca Di Benedetto Date: Mon, 5 Jan 2026 18:21:10 +0100 Subject: [PATCH 4/4] fix datatype test input --- .../functions/io/hdf5/gen_HDF5_testdata.R | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R index 7d4f56c8811..fb9fed140ab 100644 --- a/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R +++ b/src/test/scripts/functions/io/hdf5/gen_HDF5_testdata.R @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -39,7 +39,7 @@ STRING_ARRAY_LENGTH <- 30 CHUNK_SHAPE <- c(100, 20) -write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n)) { +write_matrix <- function(file_path, dataset_name, shape, generator = function(n) rnorm(n), storage.mode = "double", H5type = NULL) { values <- generator(prod(shape)) h5createDataset( file_path, @@ -49,6 +49,8 @@ write_matrix <- function(file_path, dataset_name, shape, generator = function(n) filter = "NONE", # contiguous, uncompressed layout level = 0, shuffle = FALSE, + storage.mode = storage.mode, + H5type = H5type, native = TRUE # use R column-major order, same in h5read(..., native=TRUE) in tests. 
) h5write(array(values, dim = shape), file_path, dataset_name, native = TRUE) @@ -75,21 +77,27 @@ generate_test_file_multiple_datasets <- function(dir) { generate_test_file_different_dtypes <- function(dir) { file_path <- file.path(dir, "test_different_dtypes.h5") h5createFile(file_path) - write_matrix(file_path, "double_primary", SMALL_MATRIX_2D) - write_matrix(file_path, "double_secondary", SMALL_MATRIX_2D) + # H5T_IEEE_F64LE (64-bit float) + write_matrix(file_path, "double_primary", SMALL_MATRIX_2D, storage.mode = "double") + # H5T_IEEE_F32LE (32-bit float) + write_matrix(file_path, "float32", SMALL_MATRIX_2D, H5type = "H5T_IEEE_F32LE") + # H5T_STD_I32LE (32-bit integer) write_matrix( file_path, "int32", SMALL_MATRIX_2D, - generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)), + storage.mode = "integer" ) + # H5T_STD_I64LE (64-bit integer) write_matrix( file_path, - "int32_alt", + "int64", SMALL_MATRIX_2D, - generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)) + generator = function(n) as.integer(sample(-100:100, n, replace = TRUE)), + H5type = "H5T_STD_I64LE" ) - cat("Created test_different_dtypes.h5 (double/int datasets)\n") + cat("Created test_different_dtypes.h5 (double/float/int32/int64 datasets)\n") } # https://support.hdfgroup.org/documentation/hdf5-docs/advanced_topics/chunking_in_hdf5.html @@ -98,7 +106,7 @@ generate_test_file_chunked <- function(dir) { h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - + h5createDataset(file_path, "chunked_data", dims = SMALL_MATRIX_2D, chunk = CHUNK_SHAPE, filter = "NONE", level = 0, shuffle = FALSE) h5write(data, file_path, "chunked_data", native = TRUE) @@ -111,10 +119,10 @@ generate_test_file_compressed <- function(dir) { file_path <- file.path(dir, "test_compressed.h5") h5createFile(file_path) data <- array(rnorm(prod(SMALL_MATRIX_2D)), dim = SMALL_MATRIX_2D) - h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, + h5createDataset(file_path, "gzip_compressed_9", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 9) h5write(data, file_path, "gzip_compressed_9", native = TRUE) - h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, + h5createDataset(file_path, "gzip_compressed_1", dims = SMALL_MATRIX_2D, chunk = SMALL_MATRIX_2D, level = 1) h5write(data, file_path, "gzip_compressed_1", native = TRUE) cat("Created test_compressed.h5 (gzip compression)\n") @@ -174,13 +182,13 @@ generate_test_file_with_attributes <- function(dir) { generate_test_file_empty_datasets <- function(dir) { file_path <- file.path(dir, "test_empty_datasets.h5") h5createFile(file_path) - h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), + h5createDataset(file_path, "empty", dims = c(0, SMALL_MATRIX_2D[2]), filter = "NONE", level = 0, shuffle = FALSE) - h5createDataset(file_path, "scalar", dims = 1, + h5createDataset(file_path, "scalar", dims = 1, filter = "NONE", level = 0, shuffle = FALSE, chunk = 1) h5write(1.0, file_path, "scalar", native = TRUE) - h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, + h5createDataset(file_path, "vector", dims = VECTOR_LENGTH, filter = "NONE", level = 0, shuffle = FALSE, chunk = VECTOR_LENGTH) h5write(rnorm(VECTOR_LENGTH), file_path, "vector", native = TRUE) cat("Created test_empty_datasets.h5 (empty/scalar/vector)\n") @@ -191,8 +199,8 @@ generate_test_file_string_datasets <- function(dir) { h5createFile(file_path) 
strings <- paste0("string_", 0:(STRING_ARRAY_LENGTH - 1)) # Create string dataset without compression/filters - h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, - storage.mode = "character", filter = "NONE", level = 0, + h5createDataset(file_path, "string_array", dims = STRING_ARRAY_LENGTH, + storage.mode = "character", filter = "NONE", level = 0, shuffle = FALSE, chunk = STRING_ARRAY_LENGTH) h5write(strings, file_path, "string_array", native = TRUE) cat("Created test_string_datasets.h5 (string datasets)\n") @@ -203,12 +211,12 @@ main <- function() { cat("You must execute this script from the 'hdf5' directory\n") quit(status = 1) } - + testdir <- "in" if (!dir.exists(testdir)) { dir.create(testdir) } - + test_functions <- list( generate_test_file_single_dataset, generate_test_file_multiple_datasets, @@ -221,7 +229,7 @@ main <- function() { generate_test_file_empty_datasets, generate_test_file_string_datasets ) - + for (test_func in test_functions) { tryCatch({ test_func(testdir) @@ -229,11 +237,11 @@ main <- function() { cat(sprintf(" ✗ Error: %s\n", conditionMessage(e))) }) } - + files <- sort(list.files(testdir, pattern = "\\.h5$", full.names = TRUE)) cat(sprintf("\nGenerated %d HDF5 test files in %s\n", length(files), normalizePath(testdir))) } if (!interactive()) { main() -} \ No newline at end of file +}