From 438a3c10cc33732fabb872fa8d7dd71468d976cf Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Tue, 18 Nov 2025 20:00:09 +0000
Subject: [PATCH 01/10] This almost works but the date is still showing up.
 Committing as a roll back point

---
 pipeline/preprocess.py | 55 +++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index 6209d54..94f2e8a 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -652,55 +652,42 @@ def enrich_grouped_records(
     vaccine_reference: Dict[str, Any],
     language: str,
     chart_diseases_header: List[str] | None = None,
+    ignore_diseases: List[str] | None = None,
 ) -> List[Dict[str, Any]]:
     """Enrich grouped vaccine records with disease information.
 
     If chart_diseases_header is provided, diseases not in the list are
-    collapsed into the "Other" category.
+    collapsed into the "Other" category. Diseases in ignore_diseases
+    are always removed.
+    """
 
-    Parameters
-    ----------
-    grouped : List[Dict[str, Any]]
-        Grouped vaccine records with date_given and vaccine list.
-    vaccine_reference : Dict[str, Any]
-        Map of vaccine codes to disease names.
-    language : str
-        Language code for logging.
-    chart_diseases_header : List[str], optional
-        List of diseases to include in chart. Diseases not in this list
-        are mapped to "Other".
+    print(grouped)
 
-    Returns
-    -------
-    List[Dict[str, Any]]
-        Enriched records with date_given, vaccine, and diseases fields.
-    """
     enriched: List[Dict[str, Any]] = []
+
     for item in grouped:
+        # Normalize vaccine names
         vaccines = [
             v.replace("-unspecified", "*").replace(" unspecified", "*")
             for v in item["vaccine"]
         ]
+
+        # Lookup diseases
         diseases: List[str] = []
         for vaccine in vaccines:
+            # See if the vaccine has a mapping to diseases
+            has_mapping = vaccine in vaccine_reference
+            if not has_mapping:
+                # Do not add to list
+                # Remove from vaccines list
+                # Remove whole item from grouped if no vaccines left
+                grouped.remove(item)
+                break
+
             ref = vaccine_reference.get(vaccine, vaccine)
             if isinstance(ref, list):
                 diseases.extend(ref)
-            else:
-                diseases.append(ref)
-
-        # Collapse diseases not in chart to "Other"
-        if chart_diseases_header:
-            filtered_diseases: List[str] = []
-            has_unmapped = False
-            for disease in diseases:
-                if disease in chart_diseases_header:
-                    filtered_diseases.append(disease)
-                else:
-                    has_unmapped = True
-            if has_unmapped and "Other" not in filtered_diseases:
-                filtered_diseases.append("Other")
-            diseases = filtered_diseases
+
 
         enriched.append(
             {
@@ -709,6 +696,7 @@ def enrich_grouped_records(
                 "diseases": diseases,
             }
         )
+
     return enriched
 
 
@@ -717,6 +705,7 @@ def build_preprocess_result(
     language: str,
     vaccine_reference: Dict[str, Any],
     replace_unspecified: List[str],
+    ignore_diseases
 ) -> PreprocessResult:
     """Process and normalize client data into structured artifact.
 
@@ -790,7 +779,7 @@ def build_preprocess_result(
         ]
         received_grouped = process_received_agents(row.IMMS_GIVEN, replace_unspecified)  # type: ignore[attr-defined]
         received = enrich_grouped_records(
-            received_grouped, vaccine_reference, language, chart_diseases_header
+            received_grouped, vaccine_reference, language, chart_diseases_header, ignore_diseases=None
         )
 
         postal_code = row.POSTAL_CODE if row.POSTAL_CODE else "Not provided"  # type: ignore[attr-defined]

From 65362a7b150aa66bb2b4c2aca3a16cce32dea8fa Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Tue, 18 Nov 2025 20:03:35 +0000
Subject: [PATCH 02/10] Another checkpoint. This is working but there is some
 erroneous behaviour when ignoring diseases that have multiple mappings.

---
 pipeline/orchestrator.py | 22 +++++++++++++++++++++-
 pipeline/preprocess.py   | 19 ++++++++++---------
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py
index 9517c6f..03c1eaf 100755
--- a/pipeline/orchestrator.py
+++ b/pipeline/orchestrator.py
@@ -209,6 +209,7 @@ def run_step_2_preprocess(
     output_dir: Path,
     language: str,
     run_id: str,
+    config_dir: Path,
 ) -> int:
     """Step 2: Preprocessing.
 
@@ -231,12 +232,30 @@ def run_step_2_preprocess(
     df = preprocess.check_addresses_complete(df)
 
     # Load configuration
+    config = load_config(config_dir / "parameters.yaml")
+    ignore_diseases = config.get("ignore_diseases", {})
+    
     vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
     vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))
 
+    # Filter out vaccines that include any ignored disease
+    if ignore_diseases:
+        ignore_set = set(ignore_diseases)
+
+        filtered_reference = {
+            vaccine: agents
+            for vaccine, agents in vaccine_reference.items()
+            if ignore_set.isdisjoint(agents)
+        }
+        print(f"Ignored diseases: {', '.join(sorted(ignore_set))}")
+        print(f"Filtered vaccine reference to {len(filtered_reference)} vaccines "
+              f"from {len(vaccine_reference)} total.")
+    else:
+        filtered_reference = vaccine_reference
+
     # Build preprocessing result
     result = preprocess.build_preprocess_result(
-        df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED
+        df, language, filtered_reference, preprocess.REPLACE_UNSPECIFIED, ignore_diseases
     )
 
     # Write artifact
@@ -574,6 +593,7 @@ def main() -> int:
             output_dir,
             args.language,
             run_id,
+            config_dir
         )
         step_duration = time.time() - step_start
         step_times.append(("Preprocessing", step_duration))
diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index 94f2e8a..9f217a6 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -684,17 +684,18 @@ def enrich_grouped_records(
                 grouped.remove(item)
                 break
 
-            ref = vaccine_reference.get(vaccine, vaccine)
-            if isinstance(ref, list):
-                diseases.extend(ref)
+            else:
+                ref = vaccine_reference.get(vaccine, vaccine)
+                if isinstance(ref, list):
+                    diseases.extend(ref)
 
 
-        enriched.append(
-            {
-                "date_given": item["date_given"],
-                "vaccine": vaccines,
-                "diseases": diseases,
-            }
+                enriched.append(
+                    {
+                        "date_given": item["date_given"],
+                        "vaccine": vaccines,
+                        "diseases": diseases,
+                    }
         )
 
     return enriched

From 68eb4b8559e873828d29510d5b45e90701fed1e0 Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Tue, 18 Nov 2025 20:27:16 +0000
Subject: [PATCH 03/10] Update filtering so ignored diseases are removed
 individually, not whole vaccine entries.

---
 pipeline/orchestrator.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py
index 03c1eaf..5395c68 100755
--- a/pipeline/orchestrator.py
+++ b/pipeline/orchestrator.py
@@ -238,18 +238,26 @@ def run_step_2_preprocess(
     vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH
     vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8"))
 
-    # Filter out vaccines that include any ignored disease
     if ignore_diseases:
         ignore_set = set(ignore_diseases)
 
-        filtered_reference = {
-            vaccine: agents
-            for vaccine, agents in vaccine_reference.items()
-            if ignore_set.isdisjoint(agents)
-        }
+        filtered_reference = {}
+
+        for vaccine, agents in vaccine_reference.items():
+
+            # Remove ignored diseases from agent list
+            cleaned_agents = [a for a in agents if a not in ignore_set]
+
+            # Keep only if something remains
+            if cleaned_agents:
+                filtered_reference[vaccine] = cleaned_agents
+
         print(f"Ignored diseases: {', '.join(sorted(ignore_set))}")
-        print(f"Filtered vaccine reference to {len(filtered_reference)} vaccines "
-              f"from {len(vaccine_reference)} total.")
+        print(
+            f"Filtered vaccine reference to {len(filtered_reference)} vaccines "
+            f"from {len(vaccine_reference)} total."
+        )
+
     else:
         filtered_reference = vaccine_reference
 

From 3bade2d81753adb3fa4d87853c9552b7ea9c7bbd Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Wed, 19 Nov 2025 20:49:19 +0000
Subject: [PATCH 04/10] Adding tests

---
 tests/unit/test_preprocess.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py
index 471ff23..7c5a643 100644
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -483,7 +483,7 @@ def test_build_result_generates_clients_with_sequences(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         assert len(result.clients) == 3
@@ -508,14 +508,14 @@ def test_build_result_sorts_clients_deterministically(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         result2 = preprocess.build_preprocess_result(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         ids1 = [c.client_id for c in result1.clients]
@@ -569,7 +569,7 @@ def test_build_result_sorts_by_school_then_name(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         # Expected order: Apple/Chloe/Jones, Apple/Diana/Jones, Zebra/Alice/Smith, Zebra/Bob/Smith
@@ -595,7 +595,7 @@ def test_build_result_maps_vaccines_correctly(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_diseases=[],
         )
 
         # Should have DTaP expanded to component diseases
@@ -637,7 +637,7 @@ def test_build_result_handles_missing_board_name_with_warning(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         # Should still process - at least one client
@@ -660,16 +660,16 @@ def test_build_result_french_language_support(
             normalized,
             language="fr",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         assert len(result.clients) == 1
         assert result.clients[0].language == "fr"
 
-    def test_build_result_handles_replace_unspecified(
+    def test_build_result_handles_ignore_agents(
         self, default_vaccine_reference
     ) -> None:
-        """Verify replace_unspecified filters out unspecified vaccines.
+        """Verify ignore_agents filters out unspecified vaccines.
 
         Real-world significance:
         - Input may contain "Not Specified" vaccine agents
@@ -682,7 +682,7 @@ def test_build_result_handles_replace_unspecified(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=["Not Specified", "unspecified"],
+            ignore_agents=["Not Specified", "unspecified"],
         )
 
         assert len(result.clients) == 1
@@ -708,7 +708,7 @@ def test_build_result_detects_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         # Should have 2 clients (no deduplication)
@@ -744,7 +744,7 @@ def test_build_result_detects_multiple_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         # Should have 5 clients (no deduplication)
@@ -776,7 +776,7 @@ def test_build_result_no_warning_for_unique_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replace_unspecified=[],
+            ignore_agents=[],
         )
 
         # Should have 3 unique clients

From 07b1de8631e192e65dfcce5202ef4aba4e562cbd Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Wed, 19 Nov 2025 20:54:07 +0000
Subject: [PATCH 05/10] Adding tests

---
 tests/unit/test_preprocess.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py
index 7c5a643..74f4554 100644
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -483,7 +483,7 @@ def test_build_result_generates_clients_with_sequences(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         assert len(result.clients) == 3
@@ -508,14 +508,14 @@ def test_build_result_sorts_clients_deterministically(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         result2 = preprocess.build_preprocess_result(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         ids1 = [c.client_id for c in result1.clients]
@@ -569,7 +569,7 @@ def test_build_result_sorts_by_school_then_name(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         # Expected order: Apple/Chloe/Jones, Apple/Diana/Jones, Zebra/Alice/Smith, Zebra/Bob/Smith
@@ -637,7 +637,7 @@ def test_build_result_handles_missing_board_name_with_warning(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         # Should still process - at least one client
@@ -660,7 +660,7 @@ def test_build_result_french_language_support(
             normalized,
             language="fr",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         assert len(result.clients) == 1
@@ -669,7 +669,7 @@ def test_build_result_french_language_support(
     def test_build_result_handles_ignore_agents(
         self, default_vaccine_reference
     ) -> None:
-        """Verify ignore_agents filters out unspecified vaccines.
+        """Verify ignore_diseases filters out unspecified vaccines.
 
         Real-world significance:
         - Input may contain "Not Specified" vaccine agents
@@ -682,7 +682,7 @@ def test_build_result_handles_ignore_agents(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=["Not Specified", "unspecified"],
+            ignore_diseases=["Not Specified", "unspecified"],
         )
 
         assert len(result.clients) == 1
@@ -708,7 +708,7 @@ def test_build_result_detects_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         # Should have 2 clients (no deduplication)
@@ -744,7 +744,7 @@ def test_build_result_detects_multiple_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         # Should have 5 clients (no deduplication)
@@ -776,7 +776,7 @@ def test_build_result_no_warning_for_unique_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            ignore_agents=[],
+            ignore_diseases=[],
         )
 
         # Should have 3 unique clients

From 162c861ca15becdb625826cd747c0f2db2ac28c2 Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Wed, 19 Nov 2025 21:00:20 +0000
Subject: [PATCH 06/10] Adding tests

---
 tests/unit/test_preprocess.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py
index 74f4554..c40335c 100644
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -483,6 +483,7 @@ def test_build_result_generates_clients_with_sequences(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -508,6 +509,7 @@ def test_build_result_sorts_clients_deterministically(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -569,6 +571,7 @@ def test_build_result_sorts_by_school_then_name(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -595,6 +598,7 @@ def test_build_result_maps_vaccines_correctly(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -637,6 +641,7 @@ def test_build_result_handles_missing_board_name_with_warning(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -660,6 +665,7 @@ def test_build_result_french_language_support(
             normalized,
             language="fr",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -682,6 +688,7 @@ def test_build_result_handles_ignore_agents(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replce_unspecified=[],
             ignore_diseases=["Not Specified", "unspecified"],
         )
 
@@ -708,6 +715,7 @@ def test_build_result_detects_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -744,6 +752,7 @@ def test_build_result_detects_multiple_duplicate_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -776,6 +785,7 @@ def test_build_result_no_warning_for_unique_client_ids(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 

From 4cbb36b97c689b5a6896333cae7cd14405f31ab7 Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Wed, 19 Nov 2025 21:04:38 +0000
Subject: [PATCH 07/10] Adding tests

---
 tests/unit/test_preprocess.py   | 3 ++-
 tests/unit/test_run_pipeline.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py
index c40335c..7d47044 100644
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -517,6 +517,7 @@ def test_build_result_sorts_clients_deterministically(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
+            replace_unspecified=[],
             ignore_diseases=[],
         )
 
@@ -688,7 +689,7 @@ def test_build_result_handles_ignore_agents(
             normalized,
             language="en",
             vaccine_reference=default_vaccine_reference,
-            replce_unspecified=[],
+            replace_unspecified=[],
             ignore_diseases=["Not Specified", "unspecified"],
         )
 
diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py
index dff22a8..8ca9231 100644
--- a/tests/unit/test_run_pipeline.py
+++ b/tests/unit/test_run_pipeline.py
@@ -236,6 +236,7 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d
                 output_dir=tmp_output_structure["root"],
                 language="en",
                 run_id="test_20250101_120000",
+                config_dir=tmp_test_dir,
             )
 
         assert total == 2

From 2b5e68e2b5fea172a9116dc59a3f7385e25b08bc Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Wed, 19 Nov 2025 21:18:17 +0000
Subject: [PATCH 08/10] Adding tests

---
 tests/unit/test_run_pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py
index 8ca9231..10dcc73 100644
--- a/tests/unit/test_run_pipeline.py
+++ b/tests/unit/test_run_pipeline.py
@@ -208,7 +208,7 @@ def test_run_step_1_prepare_output_user_cancels(
             )
             assert result is False
 
-    def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict) -> None:
+    def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict, config_file: Path) -> None:
         """Verify Step 2: preprocess returns client count."""
         mock_df = MagicMock()
         mock_mapped_df = MagicMock()
@@ -236,7 +236,7 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d
                 output_dir=tmp_output_structure["root"],
                 language="en",
                 run_id="test_20250101_120000",
-                config_dir=tmp_test_dir,
+                config_dir=config_file.parent,
             )
 
         assert total == 2

From f3d1bc65793117528b631ceac4c828786a727cbe Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Fri, 21 Nov 2025 15:38:56 +0000
Subject: [PATCH 09/10] Adding a patch to have HPV and HepB vaxes map to the
 other col

---
 pipeline/preprocess.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index 9f217a6..eda11a7 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -661,7 +661,6 @@ def enrich_grouped_records(
     are always removed.
     """
 
-    print(grouped)
 
     enriched: List[Dict[str, Any]] = []
 
@@ -689,6 +688,10 @@ def enrich_grouped_records(
                 if isinstance(ref, list):
                     diseases.extend(ref)
 
+                if "Hepatitis B" in diseases or "HPV" in diseases:
+                    print("Found Hepatitis B or HPV in diseases")
+                    print(ref)
+                    diseases.extend(["Other"])
 
                 enriched.append(
                     {

From 73ef22806bbd67fc9b4bca5affee08e44ea77a27 Mon Sep 17 00:00:00 2001
From: kassyray <kassy.raymond@wdgpublichealth.ca>
Date: Thu, 27 Nov 2025 16:06:21 +0000
Subject: [PATCH 10/10] Adding functionality to remove non-ispa diseases
 besides hpv and hepb from the immunization histories of clients. This needs
 to be an optional request and should not yet be pushed to main

---
 config/parameters.yaml  |   4 +-
 phu_templates/.gitkeep  |   0
 phu_templates/README.md | 111 ++++++++++++----------------------------
 pipeline/preprocess.py  |  90 ++++++++++++++++++--------------
 templates/conf.typ      |   2 +-
 5 files changed, 88 insertions(+), 119 deletions(-)
 delete mode 100644 phu_templates/.gitkeep

diff --git a/config/parameters.yaml b/config/parameters.yaml
index 5a76b33..7774932 100644
--- a/config/parameters.yaml
+++ b/config/parameters.yaml
@@ -1,6 +1,6 @@
 bundling:
   bundle_size: 100
-  group_by: null
+  group_by: school
 chart_diseases_header:
 - Diphtheria
 - Tetanus
@@ -21,6 +21,8 @@ encryption:
   enabled: false
   password:
     template: '{date_of_birth_iso_compact}'
+ignore_diseases: 
+- Other
 ignore_agents:
 - RSVAb
 - VarIg
diff --git a/phu_templates/.gitkeep b/phu_templates/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/phu_templates/README.md b/phu_templates/README.md
index 65309f5..c75c704 100644
--- a/phu_templates/README.md
+++ b/phu_templates/README.md
@@ -1,103 +1,56 @@
-# PHU Templates Directory
+# WDGPH VIPER Template Library
 
-This directory contains Public Health Unit (PHU) specific template customizations.
+The VIPER Template Library is a private repository that provides a centralized, version-controlled collection of reusable templates that are used across provincial Public Health VIPER workflows.
 
-## Usage
+This includes Typst document templates, YAML configuration schemas, and other standardized assets required by the VIPER pipeline.
 
-Each PHU should create a subdirectory here with their organization-specific templates:
+By storing these artifacts in a dedicated private repository and consuming them as a Git submodule, we ensure the following:
 
-```
-phu_templates/
-├── my_phu/
-│   ├── en_template.py              (required for English output)
-│   ├── fr_template.py              (required for French output)
-│   ├── conf.typ                    (required)
-│   └── assets/                     (optional - only if templates reference assets)
-│       ├── logo.png                (optional)
-│       └── signature.png           (optional)
-```
-
-## Running with PHU Templates
+* Reproducibility: downstream pipelines reference an exact, pinned version
+* Consistency: all PHUs use the same reviewed and approved templates
+* Governance: updates are traceable, reviewable, and versioned
+* Modularity: templates remain independent from pipeline logic
+* Security: PHU-specific assets remain internal and controlled
 
-To use a PHU-specific template, specify the template name with `--template`:
+This repository is intended for **internal** use and is included as a **submodule** inside downstream pipelines under:
 
-```bash
-# Generate English notices
-uv run viper students.xlsx en --template my_phu
-
-# Generate French notices
-uv run viper students.xlsx fr --template my_phu
+```
+/pipeline/phu_templates
 ```
 
-This will load templates from `phu_templates/my_phu/`.
-
-## Template File Requirements
-
-### Core Requirements (Always Required)
-
-- `conf.typ` - Typst configuration and utility functions
-
-### Language-Specific Requirements (Based on Output Language)
-
-- `en_template.py` - Required only if generating English notices (`--language en`)
-  - Must define `render_notice()` function
-  - Consulted only when `--language en` is specified
-  
-- `fr_template.py` - Required only if generating French notices (`--language fr`)
-  - Must define `render_notice()` function
-  - Consulted only when `--language fr` is specified
-
-**Note:** A PHU may provide templates for only one language. If a user requests a language your template does not support, the pipeline will fail with a clear error message. If you only support one language, only include that template file (e.g., only `en_template.py`).
-
-### Asset Requirements (Based on Template Implementation)
-
-Assets in the `assets/` directory are **optional** and depend entirely on your template implementation:
-
-- `assets/logo.png` - Only required if your `en_template.py` or `fr_template.py` references a logo
-- `assets/signature.png` - Only required if your `en_template.py` or `fr_template.py` references a signature
-- Other files - Any additional assets (e.g., `assets/header.png`, `assets/seal.pdf`) may be included and referenced in your templates
+Pipeline developers are encouraged to update the submodule when template changes are required, ensuring changes are reviewed and versioned before integration.
 
-**Note:** If your template references an asset (e.g., `include "assets/logo.png"` in Typst), that asset **must** exist. The pipeline will fail with a clear error if a referenced asset is missing.
+## Directory Structure
 
-## Creating a PHU Template
+Templates are stored inside a folder named using the standard PHU acronym. 
 
-If your PHU supports both English and French:
+Within each PHU folder, templates and assets can be organized as follows:
 
-```bash
-cp -r templates/ phu_templates/my_phu/
+```
+<PHU_ACRONYM>/
+├── assets/
+│   ├── logo.png
+│   └── signature.png
+├── en_template.py
+└── fr_template.py
 ```
 
-Then customize:
-- Replace `assets/logo.png` with your PHU logo
-- Replace `assets/signature.png` with your signature
-- Modify `en_template.py` and `fr_template.py` as needed
-- Adjust `conf.typ` for organization-specific styling
+## Using the Submodule
 
-### Testing Your Template
+Downstream VIPER pipelines consume this repository as a Git submodule, allowing each pipeline to reference a specific, version-controlled snapshot of the templates.
 
-```bash
-# Test English generation
-uv run viper students.xlsx en --template my_phu
+### Adding the Submodule (Initial Setup)
 
-# Test French generation (if you provided fr_template.py)
-uv run viper students.xlsx fr --template my_phu
-```
+If the pipeline does not yet include the template library:
 
-If a language template is missing:
 ```
-FileNotFoundError: Template module not found: /path/to/phu_templates/my_phu/fr_template.py
-Expected fr_template.py in /path/to/phu_templates/my_phu
+git submodule add <PRIVATE_REPO_URL> phu_templates
+git submodule update --init --recursive
 ```
 
-If an asset referenced by your template is missing:
+This will create:
+
 ```
-FileNotFoundError: Logo not found: /path/to/phu_templates/my_phu/assets/logo.png
+phu_templates/   # Points to this template library
 ```
 
-## Git Considerations
-
-**Important:** PHU-specific templates are excluded from version control via `.gitignore`.
-
-- Templates in this directory will NOT be committed to the main repository
-- Each PHU should maintain their templates in their own fork or separate repository
-- The `README.md` file and `.gitkeep` are the only tracked files in this directory
\ No newline at end of file
diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py
index eda11a7..e3834d3 100644
--- a/pipeline/preprocess.py
+++ b/pipeline/preprocess.py
@@ -654,51 +654,65 @@ def enrich_grouped_records(
     chart_diseases_header: List[str] | None = None,
     ignore_diseases: List[str] | None = None,
 ) -> List[Dict[str, Any]]:
-    """Enrich grouped vaccine records with disease information.
 
-    If chart_diseases_header is provided, diseases not in the list are
-    collapsed into the "Other" category. Diseases in ignore_diseases
-    are always removed.
-    """
+    enriched = []
+    ignore_set = set(ignore_diseases or [])
 
+    for item in grouped:
 
-    enriched: List[Dict[str, Any]] = []
+        # Rule 1 — If ANY vaccine does not exist, drop entire row
+        if any(v not in vaccine_reference for v in item["vaccine"]):
+            continue
 
-    for item in grouped:
-        # Normalize vaccine names
-        vaccines = [
-            v.replace("-unspecified", "*").replace(" unspecified", "*")
-            for v in item["vaccine"]
+        final_vaccines = []
+        final_diseases = []
+
+        for vaccine in item["vaccine"]:
+
+            disease_mapping = vaccine_reference.get(vaccine)
+
+            # Rule 2 — If mapping is "Other", skip this vaccine entirely
+            if disease_mapping == ["Other"] or disease_mapping == "Other":
+                continue
+
+            # Add vaccine to valid list
+            final_vaccines.append(vaccine)
+
+            # Add disease mapping
+            if isinstance(disease_mapping, list):
+                final_diseases.extend(disease_mapping)
+
+                # Rule 3 — HepB or HPV → also append "Other"
+                if any(d in ["Hepatitis B", "HPV"] for d in disease_mapping):
+                    final_diseases.append("Other")
+
+        # If no vaccines remain, drop entire row
+        if not final_vaccines:
+            continue
+
+        # Rule 4 — Remove ignored diseases
+        final_diseases = [d for d in final_diseases if d not in ignore_set]
+
+        # Rule 5 — Collapse diseases not in header → "Other"
+        if chart_diseases_header:
+            allowed = set(chart_diseases_header)
+            final_diseases = [
+                d if d in allowed else "Other"
+                for d in final_diseases
+            ]
+        
+        vaccines_normalized = [
+        v.replace("-unspecified", "*").replace(" unspecified", "*")
+        for v in final_vaccines
         ]
 
-        # Lookup diseases
-        diseases: List[str] = []
-        for vaccine in vaccines:
-            # See if the vaccine has a mapping to diseases
-            has_mapping = vaccine in vaccine_reference
-            if not has_mapping:
-                # Do not add to list
-                # Remove from vaccines list
-                # Remove whole item from grouped if no vaccines left
-                grouped.remove(item)
-                break
 
-            else:
-                ref = vaccine_reference.get(vaccine, vaccine)
-                if isinstance(ref, list):
-                    diseases.extend(ref)
-
-                if "Hepatitis B" in diseases or "HPV" in diseases:
-                    print("Found Hepatitis B or HPV in diseases")
-                    print(ref)
-                    diseases.extend(["Other"])
-
-                enriched.append(
-                    {
-                        "date_given": item["date_given"],
-                        "vaccine": vaccines,
-                        "diseases": diseases,
-                    }
+        enriched.append(
+            {
+                "date_given": item["date_given"],
+                "vaccine": vaccines_normalized,
+                "diseases": final_diseases,
+            }
         )
 
     return enriched
diff --git a/templates/conf.typ b/templates/conf.typ
index 9b23ccb..fe30b29 100644
--- a/templates/conf.typ
+++ b/templates/conf.typ
@@ -61,7 +61,7 @@
   let table_content = align(center)[
     #table(
       columns: columns,
-      rows: (envelope_window_height),
+      rows: auto,
       inset: font_size,
       col1_content,
       table.vline(stroke: vline_stroke),