From a0b9aa94da09f02b34d28e3f990e221ae6dc0f8b Mon Sep 17 00:00:00 2001
From: BrianMichell
Date: Thu, 23 Oct 2025 19:59:37 +0000
Subject: [PATCH 1/3] Pass 1 of numpy ingestion

---
 src/mdio/__init__.py                   |   3 +
 src/mdio/converters/numpy.py           | 297 +++++++++++++++++++++++++
 tests/integration/test_import_numpy.py | 121 ++++++++++
 3 files changed, 421 insertions(+)
 create mode 100644 src/mdio/converters/numpy.py
 create mode 100644 tests/integration/test_import_numpy.py

diff --git a/src/mdio/__init__.py b/src/mdio/__init__.py
index 5fed389c8..aea44383e 100644
--- a/src/mdio/__init__.py
+++ b/src/mdio/__init__.py
@@ -12,11 +12,14 @@
 except metadata.PackageNotFoundError:
     __version__ = "unknown"
 
+# Import numpy_to_mdio after __version__ is set to avoid circular import
+from mdio.converters.numpy import numpy_to_mdio
 
 __all__ = [
     "__version__",
     "open_mdio",
     "to_mdio",
     "mdio_to_segy",
+    "numpy_to_mdio",
     "segy_to_mdio",
 ]
diff --git a/src/mdio/converters/numpy.py b/src/mdio/converters/numpy.py
new file mode 100644
index 000000000..1b04fbd7b
--- /dev/null
+++ b/src/mdio/converters/numpy.py
@@ -0,0 +1,297 @@
+"""Conversion from Numpy to MDIO v1 format."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from mdio.api.io import _normalize_path
+from mdio.api.io import to_mdio
+from mdio.builder.schemas.chunk_grid import RegularChunkGrid
+from mdio.builder.schemas.chunk_grid import RegularChunkShape
+from mdio.builder.schemas.compressors import Blosc
+from mdio.builder.schemas.compressors import BloscCname
+from mdio.builder.schemas.dtype import ScalarType
+from mdio.builder.schemas.v1.variable import VariableMetadata
+from mdio.converters.type_converter import to_scalar_type
+from mdio.builder.templates.base import AbstractDatasetTemplate
+from mdio.builder.xarray_builder import to_xarray_dataset
+from mdio.core.dimension import Dimension
+from mdio.core.grid import Grid
+from mdio.core.utils_write import MAX_COORDINATES_BYTES
+from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
+from mdio.core.utils_write import get_constrained_chunksize
+
+if TYPE_CHECKING:
+    from pathlib import Path
+    from typing import Any
+
+    from numpy.typing import DTypeLike
+    from numpy.typing import NDArray
+    from upath import UPath
+    from xarray import Dataset as xr_Dataset
+
+    from mdio.builder.schemas import Dataset
+
+logger = logging.getLogger(__name__)
+
+
+def get_compressor(lossless: bool, compression_tolerance: float) -> list:
+    """Get compressor configuration based on compression settings."""
+    if lossless:
+        return [Blosc(cname=BloscCname.zstd)]
+    else:
+        # For lossy compression, we would need ZFP, but let's keep it simple for now
+        # and use lossless as fallback
+        logger.warning("Lossy compression not yet implemented, using lossless")
+        return [Blosc(cname=BloscCname.zstd)]
+
+
+def _prepare_inputs(
+    array: NDArray,
+    mdio_template: AbstractDatasetTemplate,
+    chunksize: tuple[int, ...] | None,
+    index_coords: dict[str, NDArray] | None,
+) -> tuple[tuple[int, ...], dict[str, NDArray]]:
+    """Prepare inputs and set defaults for chunksize and coordinates."""
+    dim_names = mdio_template.dimension_names
+
+    # Use template's chunk shape if not provided
+    if chunksize is None:
+        chunksize = mdio_template.full_chunk_shape
+
+    # Create default coordinates if not provided
+    if index_coords is None:
+        index_coords = {}
+        for name, size in zip(dim_names, array.shape, strict=True):
+            index_coords[name] = np.arange(size, dtype=np.int32)
+
+    return chunksize, index_coords
+
+
+def _build_grid_and_dataset(
+    array: NDArray,
+    mdio_template: AbstractDatasetTemplate,
+    chunksize: tuple[int, ...],
+    index_coords: dict[str, NDArray],
+    lossless: bool,
+    compression_tolerance: float,
+    header_dtype: DTypeLike | None,
+) -> tuple[Grid, Dataset]:
+    """Build the grid and dataset for the numpy array using the provided template."""
+    # Create dimensions
+    dims = [Dimension(name=name, coords=index_coords[name]) for name in mdio_template.dimension_names]
+    grid = Grid(dims=dims)
+
+    # Get compressor
+    compressors = get_compressor(lossless, compression_tolerance)
+
+    # Convert numpy dtype to MDIO ScalarType
+    data_type = to_scalar_type(array.dtype)
+
+    # Build dataset
+    mdio_ds: Dataset = mdio_template.build_dataset(
+        name=mdio_template.name,
+        sizes=array.shape,
+        header_dtype=header_dtype,
+    )
+
+    # Update the default variable with correct dtype and compressor
+    var_index = next((i for i, v in enumerate(mdio_ds.variables) if v.name == mdio_template.default_variable_name), None)
+    if var_index is not None:
+        mdio_ds.variables[var_index].data_type = data_type
+        mdio_ds.variables[var_index].compressor = compressors[0]
+
+    # Set chunk grid for the data variable
+    chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunksize))
+    if mdio_ds.variables[var_index].metadata is None:
+        mdio_ds.variables[var_index].metadata = VariableMetadata()
+    mdio_ds.variables[var_index].metadata.chunk_grid = chunk_grid
+
+    # Dynamically chunk the trace_mask
+    _chunk_variable(ds=mdio_ds, target_variable_name="trace_mask")
+
+    # Dynamically chunk coordinate variables
+    for coord in mdio_template.coordinate_names:
+        _chunk_variable(ds=mdio_ds, target_variable_name=coord)
+
+    return grid, mdio_ds
+
+
+def _chunk_variable(ds: Dataset, target_variable_name: str) -> None:
+    """Determines and sets the chunking for a specific Variable in the Dataset."""
+    # Find variable index by name
+    index = next((i for i, obj in enumerate(ds.variables) if obj.name == target_variable_name), None)
+    if index is None:
+        return
+
+    def determine_target_size(var_type: str) -> int:
+        """Determines the target size (in bytes) for a Variable based on its type."""
+        if var_type == "bool":
+            return MAX_SIZE_LIVE_MASK
+        return MAX_COORDINATES_BYTES
+
+    # Create the chunk grid metadata
+    var_type = ds.variables[index].data_type
+    full_shape = tuple(dim.size for dim in ds.variables[index].dimensions)
+    target_size = determine_target_size(var_type)
+
+    chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size)
+    chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape))
+
+    # Create variable metadata if it doesn't exist
+    if ds.variables[index].metadata is None:
+        ds.variables[index].metadata = VariableMetadata()
+
+    ds.variables[index].metadata.chunk_grid = chunk_grid
+
+
+def _populate_coordinates_and_write(
+    xr_dataset: xr_Dataset,
+    grid: Grid,
+    output_path: UPath | Path,
+    array: NDArray,
+) -> None:
+    """Populate coordinates and write data to MDIO."""
+    # Populate dimension coordinates
+    drop_vars_delayed = []
+    for dim in grid.dims:
+        xr_dataset[dim.name].values[:] = dim.coords
+        drop_vars_delayed.append(dim.name)
+
+    # Set trace mask to all True (no missing data for numpy arrays)
+    xr_dataset.trace_mask.data[:] = True
+    drop_vars_delayed.append("trace_mask")
+
+    # Create data dataset with the numpy array
+    data_var_name = xr_dataset.attrs.get("defaultVariableName", "amplitude")
+    data_ds = xr_dataset[[data_var_name]].copy()
+    data_ds[data_var_name].data[:] = array
+
+    # Combine all datasets
+    full_ds = xr_dataset[drop_vars_delayed].merge(data_ds)
+
+    # Write everything at once
+    to_mdio(full_ds, output_path=output_path, mode="w", compute=True)
+
+
+def numpy_to_mdio(  # noqa: PLR0913
+    array: NDArray,
+    mdio_template: AbstractDatasetTemplate,
+    output_path: UPath | Path | str,
+    chunksize: tuple[int, ...] | None = None,
+    index_coords: dict[str, NDArray] | None = None,
+    header_dtype: DTypeLike | None = None,
+    lossless: bool = True,
+    compression_tolerance: float = 0.01,
+    storage_options: dict[str, Any] | None = None,
+    overwrite: bool = False,
+) -> None:
+    """Convert a NumPy array to MDIO v1 format.
+
+    This function converts a NumPy array into the MDIO format following the same
+    interface pattern as SEG-Y to MDIO conversion.
+
+    Args:
+        array: Input NumPy array to be converted to MDIO format.
+        mdio_template: The MDIO template to use for the conversion. The template defines
+            the dataset structure, but coordinate information is ignored for NumPy arrays.
+        output_path: The universal path for the output MDIO v1 file.
+        chunksize: Tuple specifying the chunk sizes for each dimension of the array. If not
+            provided, uses the template's default chunk shape. Must match the number of
+            dimensions in the input array.
+        index_coords: Dictionary mapping dimension names to their coordinate arrays. If not
+            provided, defaults to sequential integers (0 to size-1) for each dimension.
+        header_dtype: Data type for trace headers, if applicable. Defaults to None.
+        lossless: If True, uses lossless Blosc compression with zstandard. If False, uses ZFP lossy
+            compression (requires `zfpy` library).
+        compression_tolerance: Tolerance for ZFP compression in lossy mode. Ignored if
+            `lossless=True`. Default is 0.01, providing ~70% size reduction.
+        storage_options: Dictionary of storage options for the MDIO output file (e.g.,
+            cloud credentials). Defaults to None (anonymous access).
+        overwrite: If True, overwrites existing MDIO file at the specified path.
+
+    Raises:
+        FileExistsError: If the output location already exists and overwrite is False.
+
+    Examples:
+        Convert a 3D NumPy array to MDIO format using a template:
+
+        >>> import numpy as np
+        >>> from mdio.converters import numpy_to_mdio
+        >>> from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate
+        >>>
+        >>> array = np.random.rand(100, 200, 300)
+        >>> template = Seismic3DPostStackTemplate(data_domain="time")
+        >>> numpy_to_mdio(
+        ...     array=array,
+        ...     mdio_template=template,
+        ...     output_path="output/file.mdio",
+        ...     chunksize=(64, 64, 64),
+        ... )
+
+        For a cloud-based output on AWS S3 with custom coordinates:
+
+        >>> coords = {
+        ...     "inline": np.arange(0, 100, 2),
+        ...     "crossline": np.arange(0, 200, 4),
+        ...     "time": np.linspace(0, 0.3, 300),
+        ... }
+        >>> numpy_to_mdio(
+        ...     array=array,
+        ...     mdio_template=template,
+        ...     output_path="s3://bucket/file.mdio",
+        ...     chunksize=(32, 32, 128),
+        ...     index_coords=coords,
+        ...     lossless=False,
+        ...     compression_tolerance=0.01,
+        ... )
+
+        Convert a 2D array with default indexing and lossless compression:
+
+        >>> from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate
+        >>> array_2d = np.random.rand(500, 1000)
+        >>> template_2d = Seismic2DPostStackTemplate(data_domain="time")
+        >>> numpy_to_mdio(
+        ...     array=array_2d,
+        ...     mdio_template=template_2d,
+        ...     output_path="output/file_2d.mdio",
+        ...     chunksize=(512, 512),
+        ... )
+    """
+    storage_options = storage_options or {}
+
+    # Prepare inputs and set defaults
+    chunksize, index_coords = _prepare_inputs(array, mdio_template, chunksize, index_coords)
+
+    # Normalize path
+    output_path = _normalize_path(output_path)
+
+    # Check if output exists
+    if not overwrite and output_path.exists():
+        err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended."
+        raise FileExistsError(err)
+
+    # Build grid and dataset
+    grid, mdio_ds = _build_grid_and_dataset(
+        array=array,
+        mdio_template=mdio_template,
+        chunksize=chunksize,
+        index_coords=index_coords,
+        lossless=lossless,
+        compression_tolerance=compression_tolerance,
+        header_dtype=header_dtype,
+    )
+
+    # Convert to xarray dataset
+    xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
+
+    # Populate coordinates and write data
+    _populate_coordinates_and_write(
+        xr_dataset=xr_dataset,
+        grid=grid,
+        output_path=output_path,
+        array=array,
+    )
diff --git a/tests/integration/test_import_numpy.py b/tests/integration/test_import_numpy.py
new file mode 100644
index 000000000..75e77eec5
--- /dev/null
+++ b/tests/integration/test_import_numpy.py
@@ -0,0 +1,121 @@
+"""Module for testing NumPy to MDIO conversion functionality.
+
+This module contains tests for the `numpy_to_mdio` function, ensuring proper conversion
+of NumPy arrays to MDIO format using templates.
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import numpy.testing as npt +import pytest + +from mdio.api.io import open_mdio +from mdio.builder.templates.base import AbstractDatasetTemplate +from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate +from mdio.converters.numpy import numpy_to_mdio + +if TYPE_CHECKING: + from numpy.typing import NDArray + + +class MockTemplate(AbstractDatasetTemplate): + """Mock template for testing.""" + + def __init__(self, dim_names: tuple[str, ...], data_domain: str = "time"): + super().__init__(data_domain=data_domain) + self._dim_names = dim_names + self._physical_coord_names = () + self._logical_coord_names = () + self._var_chunk_shape = (8, 8, 8) + + @property + def _name(self) -> str: + return "MockTemplate" + + def _load_dataset_attributes(self) -> dict: + return {"dataType": "numpy_array", "surveyType": "generic"} + + +@pytest.fixture +def mock_array() -> NDArray: + """Make a mock array.""" + rng = np.random.default_rng() + return rng.uniform(size=(15, 10, 20)).astype("float32") + + +@pytest.fixture +def mock_template() -> AbstractDatasetTemplate: + """Make a mock template.""" + return MockTemplate(("inline", "crossline", "time")) + + +CHUNK_SIZE = (8, 8, 8) + + +def test_npy_to_mdio_basic(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None: + """Test basic NumPy to MDIO conversion using a template.""" + numpy_to_mdio(mock_array, mock_template, "memory://npy.mdio", CHUNK_SIZE) + ds = open_mdio("memory://npy.mdio") + + # Check data + data_var = ds.attrs.get("defaultVariableName", "amplitude") + npt.assert_array_equal(ds[data_var].values, mock_array) + + # Check dimensions + assert list(ds.sizes.keys()) == ["inline", "crossline", "time"] + assert ds[data_var].shape == mock_array.shape + + +def test_npy_to_mdio_with_coords(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None: + """Test NumPy to MDIO conversion with custom coordinates.""" + index_coords = { + "inline": np.arange(100, 115), + "crossline": np.arange(200, 210), + "time": np.arange(0, 20), + } + numpy_to_mdio( + mock_array, + mock_template, + "memory://npy_coord.mdio", + CHUNK_SIZE, + index_coords, + ) + ds = open_mdio("memory://npy_coord.mdio") + + # Check data + data_var = ds.attrs.get("defaultVariableName", "amplitude") + npt.assert_array_equal(ds[data_var].values, mock_array) + assert ds[data_var].shape == mock_array.shape + + # Check coordinates + npt.assert_array_equal(ds["inline"].values, index_coords["inline"]) + npt.assert_array_equal(ds["crossline"].values, index_coords["crossline"]) + npt.assert_array_equal(ds["time"].values, index_coords["time"]) + + +def test_npy_to_mdio_default_chunksize(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None: + """Test NumPy to MDIO conversion using template's default chunk size.""" + numpy_to_mdio(mock_array, mock_template, "memory://npy_default.mdio") + ds = open_mdio("memory://npy_default.mdio") + + # Check data + data_var = ds.attrs.get("defaultVariableName", "amplitude") + npt.assert_array_equal(ds[data_var].values, mock_array) + assert list(ds.sizes.keys()) == ["inline", "crossline", "time"] + assert ds[data_var].shape == mock_array.shape + + +def test_npy_to_mdio_seismic_template(mock_array: NDArray) -> None: + """Test NumPy to MDIO conversion using a seismic template.""" + template = Seismic3DPostStackTemplate(data_domain="time") + numpy_to_mdio(mock_array, template, "memory://npy_seismic.mdio", CHUNK_SIZE) 
+    ds = open_mdio("memory://npy_seismic.mdio")
+
+    # Check data
+    data_var = ds.attrs.get("defaultVariableName", "amplitude")
+    npt.assert_array_equal(ds[data_var].values, mock_array)
+    assert list(ds.sizes.keys()) == ["inline", "crossline", "time"]
+    assert ds[data_var].shape == mock_array.shape

From 135c3c8e01c77966d556a48a69c1c29528e129b6 Mon Sep 17 00:00:00 2001
From: BrianMichell
Date: Fri, 24 Oct 2025 14:25:07 +0000
Subject: [PATCH 2/3] Simplifications and alignment with v1 ingestion

---
 src/mdio/converters/numpy.py           | 219 +++----------------------
 tests/integration/test_import_numpy.py |   5 +-
 2 files changed, 28 insertions(+), 196 deletions(-)

diff --git a/src/mdio/converters/numpy.py b/src/mdio/converters/numpy.py
index 1b04fbd7b..5a07baa1c 100644
--- a/src/mdio/converters/numpy.py
+++ b/src/mdio/converters/numpy.py
@@ -2,31 +2,17 @@
 
 from __future__ import annotations
 
-import logging
 from typing import TYPE_CHECKING
 
 import numpy as np
 
 from mdio.api.io import _normalize_path
 from mdio.api.io import to_mdio
-from mdio.builder.schemas.chunk_grid import RegularChunkGrid
-from mdio.builder.schemas.chunk_grid import RegularChunkShape
-from mdio.builder.schemas.compressors import Blosc
-from mdio.builder.schemas.compressors import BloscCname
-from mdio.builder.schemas.dtype import ScalarType
-from mdio.builder.schemas.v1.variable import VariableMetadata
-from mdio.converters.type_converter import to_scalar_type
-from mdio.builder.templates.base import AbstractDatasetTemplate
 from mdio.builder.xarray_builder import to_xarray_dataset
-from mdio.core.dimension import Dimension
-from mdio.core.grid import Grid
-from mdio.core.utils_write import MAX_COORDINATES_BYTES
-from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
-from mdio.core.utils_write import get_constrained_chunksize
+from mdio.converters.type_converter import to_scalar_type
 
 if TYPE_CHECKING:
     from pathlib import Path
-    from typing import Any
 
     from numpy.typing import DTypeLike
     from numpy.typing import NDArray
@@ -34,159 +20,63 @@
     from xarray import Dataset as xr_Dataset
 
     from mdio.builder.schemas import Dataset
-
-logger = logging.getLogger(__name__)
+    from mdio.builder.templates.base import AbstractDatasetTemplate
 
 
-def get_compressor(lossless: bool, compression_tolerance: float) -> list:
-    """Get compressor configuration based on compression settings."""
-    if lossless:
-        return [Blosc(cname=BloscCname.zstd)]
-    else:
-        # For lossy compression, we would need ZFP, but let's keep it simple for now
-        # and use lossless as fallback
-        logger.warning("Lossy compression not yet implemented, using lossless")
-        return [Blosc(cname=BloscCname.zstd)]
-
-
-def _prepare_inputs(
+def _build_dataset(
     array: NDArray,
     mdio_template: AbstractDatasetTemplate,
-    chunksize: tuple[int, ...] | None,
-    index_coords: dict[str, NDArray] | None,
-) -> tuple[tuple[int, ...], dict[str, NDArray]]:
-    """Prepare inputs and set defaults for chunksize and coordinates."""
-    dim_names = mdio_template.dimension_names
-
-    # Use template's chunk shape if not provided
-    if chunksize is None:
-        chunksize = mdio_template.full_chunk_shape
-
-    # Create default coordinates if not provided
-    if index_coords is None:
-        index_coords = {}
-        for name, size in zip(dim_names, array.shape, strict=True):
-            index_coords[name] = np.arange(size, dtype=np.int32)
-
-    return chunksize, index_coords
-
-
-def _build_grid_and_dataset(
-    array: NDArray,
-    mdio_template: AbstractDatasetTemplate,
-    chunksize: tuple[int, ...],
-    index_coords: dict[str, NDArray],
-    lossless: bool,
-    compression_tolerance: float,
     header_dtype: DTypeLike | None,
-) -> tuple[Grid, Dataset]:
-    """Build the grid and dataset for the numpy array using the provided template."""
-    # Create dimensions
-    dims = [Dimension(name=name, coords=index_coords[name]) for name in mdio_template.dimension_names]
-    grid = Grid(dims=dims)
-
-    # Get compressor
-    compressors = get_compressor(lossless, compression_tolerance)
-
+) -> Dataset:
+    """Build the dataset for the numpy array using the provided template."""
     # Convert numpy dtype to MDIO ScalarType
     data_type = to_scalar_type(array.dtype)
 
-    # Build dataset
+    # Build dataset using template (which defines chunking)
     mdio_ds: Dataset = mdio_template.build_dataset(
         name=mdio_template.name,
         sizes=array.shape,
         header_dtype=header_dtype,
     )
 
-    # Update the default variable with correct dtype and compressor
-    var_index = next((i for i, v in enumerate(mdio_ds.variables) if v.name == mdio_template.default_variable_name), None)
+    # Update the default variable with correct dtype
+    var_index = next(
+        (i for i, v in enumerate(mdio_ds.variables) if v.name == mdio_template.default_variable_name), None
+    )
     if var_index is not None:
         mdio_ds.variables[var_index].data_type = data_type
-        mdio_ds.variables[var_index].compressor = compressors[0]
-
-    # Set chunk grid for the data variable
-    chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunksize))
-    if mdio_ds.variables[var_index].metadata is None:
-        mdio_ds.variables[var_index].metadata = VariableMetadata()
-    mdio_ds.variables[var_index].metadata.chunk_grid = chunk_grid
-
-    # Dynamically chunk the trace_mask
-    _chunk_variable(ds=mdio_ds, target_variable_name="trace_mask")
-
-    # Dynamically chunk coordinate variables
-    for coord in mdio_template.coordinate_names:
-        _chunk_variable(ds=mdio_ds, target_variable_name=coord)
 
-    return grid, mdio_ds
-
-
-def _chunk_variable(ds: Dataset, target_variable_name: str) -> None:
-    """Determines and sets the chunking for a specific Variable in the Dataset."""
-    # Find variable index by name
-    index = next((i for i, obj in enumerate(ds.variables) if obj.name == target_variable_name), None)
-    if index is None:
-        return
-
-    def determine_target_size(var_type: str) -> int:
-        """Determines the target size (in bytes) for a Variable based on its type."""
-        if var_type == "bool":
-            return MAX_SIZE_LIVE_MASK
-        return MAX_COORDINATES_BYTES
-
-    # Create the chunk grid metadata
-    var_type = ds.variables[index].data_type
-    full_shape = tuple(dim.size for dim in ds.variables[index].dimensions)
-    target_size = determine_target_size(var_type)
-
-    chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size)
-    chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape))
-
-    # Create variable metadata if it doesn't exist
-    if ds.variables[index].metadata is None:
-        ds.variables[index].metadata = VariableMetadata()
-
-    ds.variables[index].metadata.chunk_grid = chunk_grid
+    return mdio_ds
 
 
 def _populate_coordinates_and_write(
     xr_dataset: xr_Dataset,
-    grid: Grid,
+    index_coords: dict[str, NDArray],
     output_path: UPath | Path,
     array: NDArray,
 ) -> None:
     """Populate coordinates and write data to MDIO."""
     # Populate dimension coordinates
-    drop_vars_delayed = []
-    for dim in grid.dims:
-        xr_dataset[dim.name].values[:] = dim.coords
-        drop_vars_delayed.append(dim.name)
+    for name, coords in index_coords.items():
+        xr_dataset[name].data[:] = coords
 
     # Set trace mask to all True (no missing data for numpy arrays)
     xr_dataset.trace_mask.data[:] = True
-    drop_vars_delayed.append("trace_mask")
 
-    # Create data dataset with the numpy array
+    # Set the data
     data_var_name = xr_dataset.attrs.get("defaultVariableName", "amplitude")
-    data_ds = xr_dataset[[data_var_name]].copy()
-    data_ds[data_var_name].data[:] = array
-
-    # Combine all datasets
-    full_ds = xr_dataset[drop_vars_delayed].merge(data_ds)
+    xr_dataset[data_var_name].data[:] = array
 
     # Write everything at once
-    to_mdio(full_ds, output_path=output_path, mode="w", compute=True)
+    to_mdio(xr_dataset, output_path=output_path, mode="w", compute=True)
 
 
 def numpy_to_mdio(  # noqa: PLR0913
     array: NDArray,
     mdio_template: AbstractDatasetTemplate,
     output_path: UPath | Path | str,
-    chunksize: tuple[int, ...] | None = None,
     index_coords: dict[str, NDArray] | None = None,
     header_dtype: DTypeLike | None = None,
-    lossless: bool = True,
-    compression_tolerance: float = 0.01,
-    storage_options: dict[str, Any] | None = None,
     overwrite: bool = False,
 ) -> None:
     """Convert a NumPy array to MDIO v1 format.
@@ -197,74 +87,21 @@ def numpy_to_mdio(  # noqa: PLR0913
     Args:
         array: Input NumPy array to be converted to MDIO format.
         mdio_template: The MDIO template to use for the conversion. The template defines
-            the dataset structure, but coordinate information is ignored for NumPy arrays.
+            the dataset structure including compression and chunking settings.
         output_path: The universal path for the output MDIO v1 file.
-        chunksize: Tuple specifying the chunk sizes for each dimension of the array. If not
-            provided, uses the template's default chunk shape. Must match the number of
-            dimensions in the input array.
         index_coords: Dictionary mapping dimension names to their coordinate arrays. If not
             provided, defaults to sequential integers (0 to size-1) for each dimension.
         header_dtype: Data type for trace headers, if applicable. Defaults to None.
-        lossless: If True, uses lossless Blosc compression with zstandard. If False, uses ZFP lossy
-            compression (requires `zfpy` library).
-        compression_tolerance: Tolerance for ZFP compression in lossy mode. Ignored if
-            `lossless=True`. Default is 0.01, providing ~70% size reduction.
-        storage_options: Dictionary of storage options for the MDIO output file (e.g.,
-            cloud credentials). Defaults to None (anonymous access).
         overwrite: If True, overwrites existing MDIO file at the specified path.
 
     Raises:
         FileExistsError: If the output location already exists and overwrite is False.
-
-    Examples:
-        Convert a 3D NumPy array to MDIO format using a template:
-
-        >>> import numpy as np
-        >>> from mdio.converters import numpy_to_mdio
-        >>> from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate
-        >>>
-        >>> array = np.random.rand(100, 200, 300)
-        >>> template = Seismic3DPostStackTemplate(data_domain="time")
-        >>> numpy_to_mdio(
-        ...     array=array,
-        ...     mdio_template=template,
-        ...     output_path="output/file.mdio",
-        ...     chunksize=(64, 64, 64),
-        ... )
-
-        For a cloud-based output on AWS S3 with custom coordinates:
-
-        >>> coords = {
-        ...     "inline": np.arange(0, 100, 2),
-        ...     "crossline": np.arange(0, 200, 4),
-        ...     "time": np.linspace(0, 0.3, 300),
-        ... }
-        >>> numpy_to_mdio(
-        ...     array=array,
-        ...     mdio_template=template,
-        ...     output_path="s3://bucket/file.mdio",
-        ...     chunksize=(32, 32, 128),
-        ...     index_coords=coords,
-        ...     lossless=False,
-        ...     compression_tolerance=0.01,
-        ... )
-
-        Convert a 2D array with default indexing and lossless compression:
-
-        >>> from mdio.builder.templates.seismic_2d_poststack import Seismic2DPostStackTemplate
-        >>> array_2d = np.random.rand(500, 1000)
-        >>> template_2d = Seismic2DPostStackTemplate(data_domain="time")
-        >>> numpy_to_mdio(
-        ...     array=array_2d,
-        ...     mdio_template=template_2d,
-        ...     output_path="output/file_2d.mdio",
-        ...     chunksize=(512, 512),
-        ... )
     """
-    storage_options = storage_options or {}
-
-    # Prepare inputs and set defaults
-    chunksize, index_coords = _prepare_inputs(array, mdio_template, chunksize, index_coords)
+    # Prepare coordinates - create defaults if not provided
+    if index_coords is None:
+        index_coords = {}
+        for name, size in zip(mdio_template.dimension_names, array.shape, strict=True):
+            index_coords[name] = np.arange(size, dtype=np.int32)
 
     # Normalize path
     output_path = _normalize_path(output_path)
 
     # Check if output exists
     if not overwrite and output_path.exists():
         err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended."
         raise FileExistsError(err)
 
-    # Build grid and dataset
-    grid, mdio_ds = _build_grid_and_dataset(
+    # Build dataset
+    mdio_ds = _build_dataset(
         array=array,
         mdio_template=mdio_template,
-        chunksize=chunksize,
-        index_coords=index_coords,
-        lossless=lossless,
-        compression_tolerance=compression_tolerance,
         header_dtype=header_dtype,
     )
 
     # Convert to xarray dataset
     xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)
 
     # Populate coordinates and write data
     _populate_coordinates_and_write(
         xr_dataset=xr_dataset,
-        grid=grid,
+        index_coords=index_coords,
         output_path=output_path,
         array=array,
     )
diff --git a/tests/integration/test_import_numpy.py b/tests/integration/test_import_numpy.py
index 75e77eec5..48e75e862 100644
--- a/tests/integration/test_import_numpy.py
+++ b/tests/integration/test_import_numpy.py
@@ -57,7 +57,7 @@ def mock_template() -> AbstractDatasetTemplate:
 
 def test_npy_to_mdio_basic(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None:
     """Test basic NumPy to MDIO conversion using a template."""
-    numpy_to_mdio(mock_array, mock_template, "memory://npy.mdio", CHUNK_SIZE)
+    numpy_to_mdio(mock_array, mock_template, "memory://npy.mdio")
     ds = open_mdio("memory://npy.mdio")
 
     # Check data
@@ -80,7 +80,6 @@ def test_npy_to_mdio_with_coords(mock_array: NDArray, mock_template: AbstractDat
         mock_array,
         mock_template,
         "memory://npy_coord.mdio",
-        CHUNK_SIZE,
         index_coords,
     )
     ds = open_mdio("memory://npy_coord.mdio")
@@ -111,7 +110,7 @@ def test_npy_to_mdio_default_chunksize(mock_array: NDArray, mock_template: Abstr
 def test_npy_to_mdio_seismic_template(mock_array: NDArray) -> None:
     """Test NumPy to MDIO conversion using a seismic template."""
     template = Seismic3DPostStackTemplate(data_domain="time")
-    numpy_to_mdio(mock_array, template, "memory://npy_seismic.mdio", CHUNK_SIZE)
+    numpy_to_mdio(mock_array, template, "memory://npy_seismic.mdio")
     ds = open_mdio("memory://npy_seismic.mdio")
 
     # Check data

From 9868df271874e581c017181e9e902524b189b71e Mon Sep 17 00:00:00 2001
From: BrianMichell
Date: Fri, 24 Oct 2025 15:10:08 +0000
Subject: [PATCH 3/3] Add input validation

---
 src/mdio/converters/numpy.py           | 40 ++++++++++++++++++++++
 tests/integration/test_import_numpy.py | 47 ++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/src/mdio/converters/numpy.py b/src/mdio/converters/numpy.py
index 5a07baa1c..f4d213680 100644
--- a/src/mdio/converters/numpy.py
+++ b/src/mdio/converters/numpy.py
@@ -49,6 +49,42 @@ def _build_dataset(
     return mdio_ds
 
 
+def _validate_coordinates(
+    index_coords: dict[str, NDArray],
+    mdio_template: AbstractDatasetTemplate,
+    array: NDArray,
+) -> None:
+    """Validate user-provided coordinates against template and array dimensions.
+
+    Args:
+        index_coords: Dictionary mapping dimension names to coordinate arrays.
+        mdio_template: The MDIO template defining expected dimensions.
+        array: The numpy array being converted.
+
+    Raises:
+        ValueError: If coordinate names or sizes don't match requirements.
+    """
+    # Validate that coordinate names match template dimension names
+    for coord_name in index_coords:
+        if coord_name not in mdio_template.dimension_names:
+            available_dims = sorted(mdio_template.dimension_names)
+            err = (
+                f"Coordinate name '{coord_name}' not found in template dimensions. "
+                f"Available dimensions: {available_dims}"
+            )
+            raise ValueError(err)
+
+    # Validate coordinate array sizes match array dimensions
+    for dim_name, coord_array in index_coords.items():
+        expected_size = array.shape[mdio_template.dimension_names.index(dim_name)]
+        if coord_array.size != expected_size:
+            err = (
+                f"Size of coordinate '{dim_name}' ({coord_array.size}) does not match "
+                f"array dimension size ({expected_size})"
+            )
+            raise ValueError(err)
+
+
 def _populate_coordinates_and_write(
     xr_dataset: xr_Dataset,
     index_coords: dict[str, NDArray],
@@ -111,6 +147,10 @@ def numpy_to_mdio(  # noqa: PLR0913
         err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended."
         raise FileExistsError(err)
 
+    # Validate coordinates if provided
+    if index_coords:
+        _validate_coordinates(index_coords, mdio_template, array)
+
     # Build dataset
     mdio_ds = _build_dataset(
         array=array,
diff --git a/tests/integration/test_import_numpy.py b/tests/integration/test_import_numpy.py
index 48e75e862..bb86fb824 100644
--- a/tests/integration/test_import_numpy.py
+++ b/tests/integration/test_import_numpy.py
@@ -118,3 +118,50 @@
     npt.assert_array_equal(ds[data_var].values, mock_array)
     assert list(ds.sizes.keys()) == ["inline", "crossline", "time"]
     assert ds[data_var].shape == mock_array.shape
+
+
+def test_npy_to_mdio_invalid_coordinate_names(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None:
+    """Test NumPy to MDIO conversion with invalid coordinate names."""
+    index_coords = {
+        "inline": np.arange(15),  # valid
+        "crossline": np.arange(10),  # valid
+        "invalid_dim": np.arange(20),  # invalid dimension name
+    }
+
+    with pytest.raises(ValueError, match=r"Coordinate name 'invalid_dim' not found in template dimensions"):
+        numpy_to_mdio(mock_array, mock_template, "memory://npy_invalid.mdio", index_coords=index_coords)
+
+
+def test_npy_to_mdio_invalid_coordinate_sizes(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None:
+    """Test NumPy to MDIO conversion with invalid coordinate array sizes."""
+    index_coords = {
+        "inline": np.arange(10),  # wrong size (should be 15)
+        "crossline": np.arange(10),  # valid
+        "time": np.arange(20),  # valid
+    }
+
+    with pytest.raises(
+        ValueError, match=r"Size of coordinate 'inline' \(10\) does not match array dimension size \(15\)"
+    ):
+        numpy_to_mdio(mock_array, mock_template, "memory://npy_wrong_size.mdio", index_coords=index_coords)
+
+
+def test_npy_to_mdio_valid_custom_coordinates(mock_array: NDArray, mock_template: AbstractDatasetTemplate) -> None:
+    """Test NumPy to MDIO conversion with valid custom coordinates."""
+    index_coords = {
+        "inline": np.arange(100, 115),  # valid size (15)
+        "crossline": np.arange(200, 210),  # valid size (10)
+        "time": np.arange(0, 20),  # valid size (20)
+    }
+
+    numpy_to_mdio(mock_array, mock_template, "memory://npy_valid_custom.mdio", index_coords=index_coords)
+    ds = open_mdio("memory://npy_valid_custom.mdio")
+
+    # Check data
+    data_var = ds.attrs.get("defaultVariableName", "amplitude")
+    npt.assert_array_equal(ds[data_var].values, mock_array)
+
+    # Check coordinates are correctly set
+    npt.assert_array_equal(ds["inline"].values, index_coords["inline"])
+    npt.assert_array_equal(ds["crossline"].values, index_coords["crossline"])
+    npt.assert_array_equal(ds["time"].values, index_coords["time"])