From 2fd7cd9a9540a7e8cc2fddc5ec2c91f8b4f1099f Mon Sep 17 00:00:00 2001 From: Bert Vandenbroucke Date: Fri, 4 Feb 2022 17:21:02 +0100 Subject: [PATCH 1/4] Started implementing support for split catalogues. Needs proper testing. --- velociraptor/catalogue/catalogue.py | 33 ++++++----------- velociraptor/catalogue/reader.py | 57 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 21 deletions(-) create mode 100644 velociraptor/catalogue/reader.py diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py index efe7221..cb6f20b 100644 --- a/velociraptor/catalogue/catalogue.py +++ b/velociraptor/catalogue/catalogue.py @@ -15,6 +15,7 @@ from velociraptor.catalogue.derived import DerivedQuantities from velociraptor.catalogue.registration import global_registration_functions from velociraptor.exceptions import RegistrationDoesNotMatchError +from velociraptor.catalogue.reader import VelociraptorCatalogueReader class VelociraptorFieldMetadata(object): @@ -88,7 +89,7 @@ def register_field_properties(self): return -def generate_getter(filename, name: str, field: str, full_name: str, unit): +def generate_getter(reader, name: str, field: str, full_name: str, unit): """ Generates a function that: @@ -113,14 +114,9 @@ def getter(self): if current_value is not None: return current_value else: - with h5py.File(filename, "r") as handle: - try: - setattr(self, f"_{name}", unyt.unyt_array(handle[field][...], unit)) - getattr(self, f"_{name}").name = full_name - getattr(self, f"_{name}").file = filename - except KeyError: - print(f"Could not read {field}") - return None + setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit)) + getattr(self, f"_{name}").name = full_name + getattr(self, f"_{name}").file = reader.filenames[0] return getattr(self, f"_{name}") @@ -156,7 +152,7 @@ def deleter(self): def generate_sub_catalogue( - filename, + reader, registration_name: str, registration_function: Callable, units: VelociraptorUnits, @@ -173,10 +169,7 @@ def generate_sub_catalogue( """ # This creates a _copy_ of the _class_, not object. - this_sub_catalogue_bases = ( - __VelociraptorSubCatalogue, - object, - ) + this_sub_catalogue_bases = (__VelociraptorSubCatalogue, object) this_sub_catalogue_dict = {} valid_sub_paths = [] @@ -186,11 +179,7 @@ def generate_sub_catalogue( this_sub_catalogue_dict[metadata.snake_case] = property( generate_getter( - filename, - metadata.snake_case, - metadata.path, - metadata.name, - metadata.unit, + reader, metadata.snake_case, metadata.path, metadata.name, metadata.unit ), generate_setter(metadata.snake_case), generate_deleter(metadata.snake_case), @@ -205,7 +194,7 @@ def generate_sub_catalogue( ) # Finally, we can actually create an instance of our new class. - catalogue = ThisSubCatalogue(filename=filename) + catalogue = ThisSubCatalogue(filename=reader.filenames[0]) catalogue.valid_sub_paths = valid_sub_paths return catalogue @@ -375,6 +364,8 @@ def __create_sub_catalogues(self): else: self.invalid_field_paths.append(path) + reader = VelociraptorCatalogueReader(self.filename) + # For each registration function, we create a dynamic sub-class that # contains only that information - otherwise the namespace of the # VelociraptorCatalogue is way too crowded. @@ -383,7 +374,7 @@ def __create_sub_catalogues(self): self, attribute_name, generate_sub_catalogue( - filename=self.filename, + reader=reader, registration_name=attribute_name, # This ensures each class has a unique name registration_function=self.registration_functions[attribute_name], units=self.units, diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py new file mode 100644 index 0000000..71115dc --- /dev/null +++ b/velociraptor/catalogue/reader.py @@ -0,0 +1,57 @@ +""" +Main objects for the velociraptor reading library. + +This is based upon the reading routines in the SWIFTsimIO library. +""" + +import h5py +import re +import numpy as np + + +class VelociraptorCatalogueReader(object): + def __init__(self, filename): + with h5py.File(filename, "r") as handle: + num_files = handle["Num_of_files"][0] + if num_files == 1: + self.filenames = [filename] + else: + basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0] + self.filenames = [f"{basename}.{idx}" for idx in range(num_files)] + + def read_field(self, field): + if len(self.filenames) == 1: + with h5py.File(self.filenames[0], "r") as handle: + try: + value = handle[field][...] + except KeyError: + print(f"Could not read {field}") + return None + return value + else: + # figure out the shape and dtype of the return value + dtype = None + shape = None + for filename in self.filenames: + with h5py.File(filename, "r") as handle: + try: + ds = handle[field] + except KeyError: + print(f"Could not read {field}") + return None + if dtype is None: + dtype = ds.dtype + shape = ds.shape + else: + shape[0] += ds.shape[0] + + # create an empty array to store the return value + value = np.zeros(shape, dtype=dtype) + # now read the data (no need to check for existence again) + offset = 0 + for filename in self.filenames: + with h5py.File(filename, "r") as handle: + size = handle[field].shape[0] + value[offset : offset + size] = handle[field][...] + offset += size + return value From 1914c30c4af0335fd43108e973155fefad067f15 Mon Sep 17 00:00:00 2001 From: Bert Vandenbroucke Date: Wed, 9 Feb 2022 13:30:26 +0100 Subject: [PATCH 2/4] Tested split catalogue reader, fixed some small issues and added documentation. --- velociraptor/catalogue/reader.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py index 71115dc..d695da9 100644 --- a/velociraptor/catalogue/reader.py +++ b/velociraptor/catalogue/reader.py @@ -10,12 +10,29 @@ class VelociraptorCatalogueReader(object): + """ + VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it + will detect whether this catalogue is self-contained or part of a larger + split catalogue consisting of multiple files. + + When a split catalogue is used, any of the catalogue.properties.X files can + be passed on to the constructor, where X is a counter ranging from 0 to + properties_file["Num_of_files"]-1. When a dataset is extracted from such a + catalogue, the elements in the resulting dataset will be ordered in blocks + of increasing X. + + For split catalogues, this class's read_field() method handles reading the + distributed datasets. For unsplit catalogues, it behaves exactly the same + as a direct read from the HDF5 file. + """ + def __init__(self, filename): with h5py.File(filename, "r") as handle: num_files = handle["Num_of_files"][0] if num_files == 1: self.filenames = [filename] else: + # compose the other file names basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0] self.filenames = [f"{basename}.{idx}" for idx in range(num_files)] @@ -29,7 +46,8 @@ def read_field(self, field): return None return value else: - # figure out the shape and dtype of the return value + # figure out the shape and dtype of the return value, so that we can + # create the appropriate array dtype = None shape = None for filename in self.filenames: @@ -43,11 +61,17 @@ def read_field(self, field): dtype = ds.dtype shape = ds.shape else: - shape[0] += ds.shape[0] + # tuples are immutable, so instead of + # shape[0]+= ds.shape[0], we have to unpack, sum and + # then pack again + shape0, *shaperest = shape + shape0 += ds.shape[0] + shape = (shape0, *shaperest) # create an empty array to store the return value value = np.zeros(shape, dtype=dtype) - # now read the data (no need to check for existence again) + # now read the data (no need to check for existence again, this was + # done when getting the shape and type) offset = 0 for filename in self.filenames: with h5py.File(filename, "r") as handle: From 4f3a9641086d8e975db1efd7182d9b4e64e12fd7 Mon Sep 17 00:00:00 2001 From: Bert Vandenbroucke Date: Thu, 17 Feb 2022 12:06:14 +0100 Subject: [PATCH 3/4] Added type hints, made reader.filename property. --- velociraptor/catalogue/catalogue.py | 4 ++-- velociraptor/catalogue/reader.py | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py index cb6f20b..e84eed7 100644 --- a/velociraptor/catalogue/catalogue.py +++ b/velociraptor/catalogue/catalogue.py @@ -116,7 +116,7 @@ def getter(self): else: setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit)) getattr(self, f"_{name}").name = full_name - getattr(self, f"_{name}").file = reader.filenames[0] + getattr(self, f"_{name}").file = reader.filename return getattr(self, f"_{name}") @@ -194,7 +194,7 @@ def generate_sub_catalogue( ) # Finally, we can actually create an instance of our new class. - catalogue = ThisSubCatalogue(filename=reader.filenames[0]) + catalogue = ThisSubCatalogue(filename=reader.filename) catalogue.valid_sub_paths = valid_sub_paths return catalogue diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py index d695da9..94b34e2 100644 --- a/velociraptor/catalogue/reader.py +++ b/velociraptor/catalogue/reader.py @@ -8,6 +8,7 @@ import re import numpy as np +from typing import List class VelociraptorCatalogueReader(object): """ @@ -26,7 +27,15 @@ class VelociraptorCatalogueReader(object): as a direct read from the HDF5 file. """ - def __init__(self, filename): + # List of files that make up the catalogue + filenames: List[str] + + def __init__(self, filename: str): + """ + I take in: + + + filename of (one of) the velociraptor properties file(s) + """ with h5py.File(filename, "r") as handle: num_files = handle["Num_of_files"][0] if num_files == 1: @@ -36,7 +45,18 @@ def __init__(self, filename): basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0] self.filenames = [f"{basename}.{idx}" for idx in range(num_files)] - def read_field(self, field): + @property + def filename(self): + """ + Returns the velociraptor properties file name or the first file name + if the catalogue is split + """ + return self.filenames[0] + + def read_field(self, field: str): + """ + Read the given field from the catalogue file(s) + """ if len(self.filenames) == 1: with h5py.File(self.filenames[0], "r") as handle: try: From b9f74234a81893efc72b388e25ee5d79dd828764 Mon Sep 17 00:00:00 2001 From: Bert Vandenbroucke Date: Fri, 11 Mar 2022 10:44:27 +0100 Subject: [PATCH 4/4] Fixed error in VelociraptorCatalogueReader when filename is a pathlib.Path. --- velociraptor/catalogue/reader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py index 94b34e2..79894fe 100644 --- a/velociraptor/catalogue/reader.py +++ b/velociraptor/catalogue/reader.py @@ -10,6 +10,7 @@ from typing import List + class VelociraptorCatalogueReader(object): """ VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it @@ -42,7 +43,8 @@ def __init__(self, filename: str): self.filenames = [filename] else: # compose the other file names - basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0] + # we cast to str() because filename could be a pathlib.Path + basename = re.match("(\S+properties)\.\d+\Z", str(filename)).groups()[0] self.filenames = [f"{basename}.{idx}" for idx in range(num_files)] @property