From 2fd7cd9a9540a7e8cc2fddc5ec2c91f8b4f1099f Mon Sep 17 00:00:00 2001
From: Bert Vandenbroucke <bert.vandenbroucke@gmail.com>
Date: Fri, 4 Feb 2022 17:21:02 +0100
Subject: [PATCH 1/4] Started implementing support for split catalogues. Needs
 proper testing.

---
 velociraptor/catalogue/catalogue.py | 33 ++++++-----------
 velociraptor/catalogue/reader.py    | 57 +++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 21 deletions(-)
 create mode 100644 velociraptor/catalogue/reader.py

diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py
index efe7221..cb6f20b 100644
--- a/velociraptor/catalogue/catalogue.py
+++ b/velociraptor/catalogue/catalogue.py
@@ -15,6 +15,7 @@
 from velociraptor.catalogue.derived import DerivedQuantities
 from velociraptor.catalogue.registration import global_registration_functions
 from velociraptor.exceptions import RegistrationDoesNotMatchError
+from velociraptor.catalogue.reader import VelociraptorCatalogueReader
 
 
 class VelociraptorFieldMetadata(object):
@@ -88,7 +89,7 @@ def register_field_properties(self):
         return
 
 
-def generate_getter(filename, name: str, field: str, full_name: str, unit):
+def generate_getter(reader, name: str, field: str, full_name: str, unit):
     """
     Generates a function that:
 
@@ -113,14 +114,9 @@ def getter(self):
         if current_value is not None:
             return current_value
         else:
-            with h5py.File(filename, "r") as handle:
-                try:
-                    setattr(self, f"_{name}", unyt.unyt_array(handle[field][...], unit))
-                    getattr(self, f"_{name}").name = full_name
-                    getattr(self, f"_{name}").file = filename
-                except KeyError:
-                    print(f"Could not read {field}")
-                    return None
+            setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit))
+            getattr(self, f"_{name}").name = full_name
+            getattr(self, f"_{name}").file = reader.filenames[0]
 
         return getattr(self, f"_{name}")
 
@@ -156,7 +152,7 @@ def deleter(self):
 
 
 def generate_sub_catalogue(
-    filename,
+    reader,
     registration_name: str,
     registration_function: Callable,
     units: VelociraptorUnits,
@@ -173,10 +169,7 @@ def generate_sub_catalogue(
     """
 
     # This creates a _copy_ of the _class_, not object.
-    this_sub_catalogue_bases = (
-        __VelociraptorSubCatalogue,
-        object,
-    )
+    this_sub_catalogue_bases = (__VelociraptorSubCatalogue, object)
     this_sub_catalogue_dict = {}
 
     valid_sub_paths = []
@@ -186,11 +179,7 @@ def generate_sub_catalogue(
 
         this_sub_catalogue_dict[metadata.snake_case] = property(
             generate_getter(
-                filename,
-                metadata.snake_case,
-                metadata.path,
-                metadata.name,
-                metadata.unit,
+                reader, metadata.snake_case, metadata.path, metadata.name, metadata.unit
             ),
             generate_setter(metadata.snake_case),
             generate_deleter(metadata.snake_case),
@@ -205,7 +194,7 @@ def generate_sub_catalogue(
     )
 
     # Finally, we can actually create an instance of our new class.
-    catalogue = ThisSubCatalogue(filename=filename)
+    catalogue = ThisSubCatalogue(filename=reader.filenames[0])
     catalogue.valid_sub_paths = valid_sub_paths
 
     return catalogue
@@ -375,6 +364,8 @@ def __create_sub_catalogues(self):
             else:
                 self.invalid_field_paths.append(path)
 
+        reader = VelociraptorCatalogueReader(self.filename)
+
         # For each registration function, we create a dynamic sub-class that
         # contains only that information - otherwise the namespace of the
         # VelociraptorCatalogue is way too crowded.
@@ -383,7 +374,7 @@ def __create_sub_catalogues(self):
                 self,
                 attribute_name,
                 generate_sub_catalogue(
-                    filename=self.filename,
+                    reader=reader,
                     registration_name=attribute_name,  # This ensures each class has a unique name
                     registration_function=self.registration_functions[attribute_name],
                     units=self.units,
diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py
new file mode 100644
index 0000000..71115dc
--- /dev/null
+++ b/velociraptor/catalogue/reader.py
@@ -0,0 +1,57 @@
+"""
+Main objects for the velociraptor reading library.
+
+This is based upon the reading routines in the SWIFTsimIO library.
+"""
+
+import h5py
+import re
+import numpy as np
+
+
+class VelociraptorCatalogueReader(object):
+    def __init__(self, filename):
+        with h5py.File(filename, "r") as handle:
+            num_files = handle["Num_of_files"][0]
+        if num_files == 1:
+            self.filenames = [filename]
+        else:
+            basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0]
+            self.filenames = [f"{basename}.{idx}" for idx in range(num_files)]
+
+    def read_field(self, field):
+        if len(self.filenames) == 1:
+            with h5py.File(self.filenames[0], "r") as handle:
+                try:
+                    value = handle[field][...]
+                except KeyError:
+                    print(f"Could not read {field}")
+                    return None
+            return value
+        else:
+            # figure out the shape and dtype of the return value
+            dtype = None
+            shape = None
+            for filename in self.filenames:
+                with h5py.File(filename, "r") as handle:
+                    try:
+                        ds = handle[field]
+                    except KeyError:
+                        print(f"Could not read {field}")
+                        return None
+                    if dtype is None:
+                        dtype = ds.dtype
+                        shape = ds.shape
+                    else:
+                        shape[0] += ds.shape[0]
+
+            # create an empty array to store the return value
+            value = np.zeros(shape, dtype=dtype)
+            # now read the data (no need to check for existence again)
+            offset = 0
+            for filename in self.filenames:
+                with h5py.File(filename, "r") as handle:
+                    size = handle[field].shape[0]
+                    value[offset : offset + size] = handle[field][...]
+                    offset += size
+            return value

From 1914c30c4af0335fd43108e973155fefad067f15 Mon Sep 17 00:00:00 2001
From: Bert Vandenbroucke <vandenbroucke@strw.leidenuniv.nl>
Date: Wed, 9 Feb 2022 13:30:26 +0100
Subject: [PATCH 2/4] Tested split catalogue reader, fixed some small issues
 and added documentation.

---
 velociraptor/catalogue/reader.py | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py
index 71115dc..d695da9 100644
--- a/velociraptor/catalogue/reader.py
+++ b/velociraptor/catalogue/reader.py
@@ -10,12 +10,29 @@
 
 
 class VelociraptorCatalogueReader(object):
+    """
+    VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it
+    will detect whether this catalogue is self-contained or part of a larger
+    split catalogue consisting of multiple files.
+
+    When a split catalogue is used, any of the catalogue.properties.X files can
+    be passed on to the constructor, where X is a counter ranging from 0 to
+    properties_file["Num_of_files"]-1. When a dataset is extracted from such a
+    catalogue, the elements in the resulting dataset will be ordered in blocks
+    of increasing X.
+
+    For split catalogues, this class's read_field() method handles reading the
+    distributed datasets. For unsplit catalogues, it behaves exactly the same
+    as a direct read from the HDF5 file.
+    """
+
     def __init__(self, filename):
         with h5py.File(filename, "r") as handle:
             num_files = handle["Num_of_files"][0]
         if num_files == 1:
             self.filenames = [filename]
         else:
+            # compose the other file names
             basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0]
             self.filenames = [f"{basename}.{idx}" for idx in range(num_files)]
 
@@ -29,7 +46,8 @@ def read_field(self, field):
                     return None
             return value
         else:
-            # figure out the shape and dtype of the return value
+            # figure out the shape and dtype of the return value, so that we can
+            # create the appropriate array
             dtype = None
             shape = None
             for filename in self.filenames:
@@ -43,11 +61,17 @@ def read_field(self, field):
                         dtype = ds.dtype
                         shape = ds.shape
                     else:
-                        shape[0] += ds.shape[0]
+                        # tuples are immutable, so instead of
+                        # shape[0]+= ds.shape[0], we have to unpack, sum and
+                        # then pack again
+                        shape0, *shaperest = shape
+                        shape0 += ds.shape[0]
+                        shape = (shape0, *shaperest)
 
             # create an empty array to store the return value
             value = np.zeros(shape, dtype=dtype)
-            # now read the data (no need to check for existence again)
+            # now read the data (no need to check for existence again, this was
+            # done when getting the shape and type)
             offset = 0
             for filename in self.filenames:
                 with h5py.File(filename, "r") as handle:

From 4f3a9641086d8e975db1efd7182d9b4e64e12fd7 Mon Sep 17 00:00:00 2001
From: Bert Vandenbroucke <vandenbroucke@strw.leidenuniv.nl>
Date: Thu, 17 Feb 2022 12:06:14 +0100
Subject: [PATCH 3/4] Added type hints, made reader.filename a property.

---
 velociraptor/catalogue/catalogue.py |  4 ++--
 velociraptor/catalogue/reader.py    | 24 ++++++++++++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/velociraptor/catalogue/catalogue.py b/velociraptor/catalogue/catalogue.py
index cb6f20b..e84eed7 100644
--- a/velociraptor/catalogue/catalogue.py
+++ b/velociraptor/catalogue/catalogue.py
@@ -116,7 +116,7 @@ def getter(self):
         else:
             setattr(self, f"_{name}", unyt.unyt_array(reader.read_field(field), unit))
             getattr(self, f"_{name}").name = full_name
-            getattr(self, f"_{name}").file = reader.filenames[0]
+            getattr(self, f"_{name}").file = reader.filename
 
         return getattr(self, f"_{name}")
 
@@ -194,7 +194,7 @@ def generate_sub_catalogue(
     )
 
     # Finally, we can actually create an instance of our new class.
-    catalogue = ThisSubCatalogue(filename=reader.filenames[0])
+    catalogue = ThisSubCatalogue(filename=reader.filename)
     catalogue.valid_sub_paths = valid_sub_paths
 
     return catalogue
diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py
index d695da9..94b34e2 100644
--- a/velociraptor/catalogue/reader.py
+++ b/velociraptor/catalogue/reader.py
@@ -8,6 +8,7 @@
 import re
 import numpy as np
 
+from typing import List
 
 class VelociraptorCatalogueReader(object):
     """
@@ -26,7 +27,15 @@ class VelociraptorCatalogueReader(object):
     as a direct read from the HDF5 file.
     """
 
-    def __init__(self, filename):
+    # List of files that make up the catalogue
+    filenames: List[str]
+
+    def __init__(self, filename: str):
+        """
+        I take in:
+
+        + filename of (one of) the velociraptor properties file(s)
+        """
         with h5py.File(filename, "r") as handle:
             num_files = handle["Num_of_files"][0]
         if num_files == 1:
@@ -36,7 +45,18 @@ def __init__(self, filename):
             basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0]
             self.filenames = [f"{basename}.{idx}" for idx in range(num_files)]
 
-    def read_field(self, field):
+    @property
+    def filename(self):
+        """
+        Returns the velociraptor properties file name or the first file name
+        if the catalogue is split
+        """
+        return self.filenames[0]
+
+    def read_field(self, field: str):
+        """
+        Read the given field from the catalogue file(s)
+        """
         if len(self.filenames) == 1:
             with h5py.File(self.filenames[0], "r") as handle:
                 try:

From b9f74234a81893efc72b388e25ee5d79dd828764 Mon Sep 17 00:00:00 2001
From: Bert Vandenbroucke <vandenbroucke@strw.leidenuniv.nl>
Date: Fri, 11 Mar 2022 10:44:27 +0100
Subject: [PATCH 4/4] Fixed error in VelociraptorCatalogueReader when filename
 is a pathlib.Path.

---
 velociraptor/catalogue/reader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/velociraptor/catalogue/reader.py b/velociraptor/catalogue/reader.py
index 94b34e2..79894fe 100644
--- a/velociraptor/catalogue/reader.py
+++ b/velociraptor/catalogue/reader.py
@@ -10,6 +10,7 @@
 
 from typing import List
 
+
 class VelociraptorCatalogueReader(object):
     """
     VELOCIraptor catalogue reader. Pass it the name of a catalogue file and it
@@ -42,7 +43,8 @@ def __init__(self, filename: str):
             self.filenames = [filename]
         else:
             # compose the other file names
-            basename = re.match("(\S+properties)\.\d+\Z", filename).groups()[0]
+            # we cast to str() because filename could be a pathlib.Path
+            basename = re.match(r"(\S+properties)\.\d+\Z", str(filename)).groups()[0]
             self.filenames = [f"{basename}.{idx}" for idx in range(num_files)]
 
     @property