From 438a3c10cc33732fabb872fa8d7dd71468d976cf Mon Sep 17 00:00:00 2001 From: kassyray Date: Tue, 18 Nov 2025 20:00:09 +0000 Subject: [PATCH 01/10] This almost works but the date is still showing up. Committing as a roll back point --- pipeline/preprocess.py | 55 +++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 6209d54..94f2e8a 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -652,55 +652,42 @@ def enrich_grouped_records( vaccine_reference: Dict[str, Any], language: str, chart_diseases_header: List[str] | None = None, + ignore_diseases: List[str] | None = None, ) -> List[Dict[str, Any]]: """Enrich grouped vaccine records with disease information. If chart_diseases_header is provided, diseases not in the list are - collapsed into the "Other" category. + collapsed into the "Other" category. Diseases in ignore_diseases + are always removed. + """ - Parameters - ---------- - grouped : List[Dict[str, Any]] - Grouped vaccine records with date_given and vaccine list. - vaccine_reference : Dict[str, Any] - Map of vaccine codes to disease names. - language : str - Language code for logging. - chart_diseases_header : List[str], optional - List of diseases to include in chart. Diseases not in this list - are mapped to "Other". + print(grouped) - Returns - ------- - List[Dict[str, Any]] - Enriched records with date_given, vaccine, and diseases fields. 
- """ enriched: List[Dict[str, Any]] = [] + for item in grouped: + # Normalize vaccine names vaccines = [ v.replace("-unspecified", "*").replace(" unspecified", "*") for v in item["vaccine"] ] + + # Lookup diseases diseases: List[str] = [] for vaccine in vaccines: + # See if the vaccine has a mapping to diseases + has_mapping = vaccine in vaccine_reference + if not has_mapping: + # Do not add to list + # Remove from vaccines list + # Remove whole item from grouped if no vaccines left + grouped.remove(item) + break + ref = vaccine_reference.get(vaccine, vaccine) if isinstance(ref, list): diseases.extend(ref) - else: - diseases.append(ref) - - # Collapse diseases not in chart to "Other" - if chart_diseases_header: - filtered_diseases: List[str] = [] - has_unmapped = False - for disease in diseases: - if disease in chart_diseases_header: - filtered_diseases.append(disease) - else: - has_unmapped = True - if has_unmapped and "Other" not in filtered_diseases: - filtered_diseases.append("Other") - diseases = filtered_diseases + enriched.append( { @@ -709,6 +696,7 @@ def enrich_grouped_records( "diseases": diseases, } ) + return enriched @@ -717,6 +705,7 @@ def build_preprocess_result( language: str, vaccine_reference: Dict[str, Any], replace_unspecified: List[str], + ignore_diseases ) -> PreprocessResult: """Process and normalize client data into structured artifact. 
@@ -790,7 +779,7 @@ def build_preprocess_result( ] received_grouped = process_received_agents(row.IMMS_GIVEN, replace_unspecified) # type: ignore[attr-defined] received = enrich_grouped_records( - received_grouped, vaccine_reference, language, chart_diseases_header + received_grouped, vaccine_reference, language, chart_diseases_header, ignore_diseases=None ) postal_code = row.POSTAL_CODE if row.POSTAL_CODE else "Not provided" # type: ignore[attr-defined] From 65362a7b150aa66bb2b4c2aca3a16cce32dea8fa Mon Sep 17 00:00:00 2001 From: kassyray Date: Tue, 18 Nov 2025 20:03:35 +0000 Subject: [PATCH 02/10] Another checkpoint. This is working but there is some erroneous behaviour when ignore diseases that have multiple mappings. --- pipeline/orchestrator.py | 22 +++++++++++++++++++++- pipeline/preprocess.py | 19 ++++++++++--------- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 9517c6f..03c1eaf 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -209,6 +209,7 @@ def run_step_2_preprocess( output_dir: Path, language: str, run_id: str, + config_dir: Path, ) -> int: """Step 2: Preprocessing. 
@@ -231,12 +232,30 @@ def run_step_2_preprocess( df = preprocess.check_addresses_complete(df) # Load configuration + config = load_config(config_dir / "parameters.yaml") + ignore_diseases = config.get("ignore_diseases", {}) + vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8")) + # Filter out vaccines that include any ignored disease + if ignore_diseases: + ignore_set = set(ignore_diseases) + + filtered_reference = { + vaccine: agents + for vaccine, agents in vaccine_reference.items() + if ignore_set.isdisjoint(agents) + } + print(f"Ignored diseases: {', '.join(sorted(ignore_set))}") + print(f"Filtered vaccine reference to {len(filtered_reference)} vaccines " + f"from {len(vaccine_reference)} total.") + else: + filtered_reference = vaccine_reference + # Build preprocessing result result = preprocess.build_preprocess_result( - df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED + df, language, filtered_reference, preprocess.REPLACE_UNSPECIFIED, ignore_diseases ) # Write artifact @@ -574,6 +593,7 @@ def main() -> int: output_dir, args.language, run_id, + config_dir ) step_duration = time.time() - step_start step_times.append(("Preprocessing", step_duration)) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 94f2e8a..9f217a6 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -684,17 +684,18 @@ def enrich_grouped_records( grouped.remove(item) break - ref = vaccine_reference.get(vaccine, vaccine) - if isinstance(ref, list): - diseases.extend(ref) + else: + ref = vaccine_reference.get(vaccine, vaccine) + if isinstance(ref, list): + diseases.extend(ref) - enriched.append( - { - "date_given": item["date_given"], - "vaccine": vaccines, - "diseases": diseases, - } + enriched.append( + { + "date_given": item["date_given"], + "vaccine": vaccines, + "diseases": diseases, + } ) return enriched From 68eb4b8559e873828d29510d5b45e90701fed1e0 Mon Sep 
17 00:00:00 2001 From: kassyray Date: Tue, 18 Nov 2025 20:27:16 +0000 Subject: [PATCH 03/10] Update filtering so ignored diseases are removed individually, not whole vaccine entries. --- pipeline/orchestrator.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 03c1eaf..5395c68 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -238,18 +238,26 @@ def run_step_2_preprocess( vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8")) - # Filter out vaccines that include any ignored disease if ignore_diseases: ignore_set = set(ignore_diseases) - filtered_reference = { - vaccine: agents - for vaccine, agents in vaccine_reference.items() - if ignore_set.isdisjoint(agents) - } + filtered_reference = {} + + for vaccine, agents in vaccine_reference.items(): + + # Remove ignored diseases from agent list + cleaned_agents = [a for a in agents if a not in ignore_set] + + # Keep only if something remains + if cleaned_agents: + filtered_reference[vaccine] = cleaned_agents + print(f"Ignored diseases: {', '.join(sorted(ignore_set))}") - print(f"Filtered vaccine reference to {len(filtered_reference)} vaccines " - f"from {len(vaccine_reference)} total.") + print( + f"Filtered vaccine reference to {len(filtered_reference)} vaccines " + f"from {len(vaccine_reference)} total." 
+ ) + else: filtered_reference = vaccine_reference From 3bade2d81753adb3fa4d87853c9552b7ea9c7bbd Mon Sep 17 00:00:00 2001 From: kassyray Date: Wed, 19 Nov 2025 20:49:19 +0000 Subject: [PATCH 04/10] Adding tests --- tests/unit/test_preprocess.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py index 471ff23..7c5a643 100644 --- a/tests/unit/test_preprocess.py +++ b/tests/unit/test_preprocess.py @@ -483,7 +483,7 @@ def test_build_result_generates_clients_with_sequences( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) assert len(result.clients) == 3 @@ -508,14 +508,14 @@ def test_build_result_sorts_clients_deterministically( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) result2 = preprocess.build_preprocess_result( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) ids1 = [c.client_id for c in result1.clients] @@ -569,7 +569,7 @@ def test_build_result_sorts_by_school_then_name( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) # Expected order: Apple/Chloe/Jones, Apple/Diana/Jones, Zebra/Alice/Smith, Zebra/Bob/Smith @@ -595,7 +595,7 @@ def test_build_result_maps_vaccines_correctly( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_diseases=[], ) # Should have DTaP expanded to component diseases @@ -637,7 +637,7 @@ def test_build_result_handles_missing_board_name_with_warning( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) # Should still process - at least one client @@ -660,16 +660,16 @@ def test_build_result_french_language_support( normalized, 
language="fr", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) assert len(result.clients) == 1 assert result.clients[0].language == "fr" - def test_build_result_handles_replace_unspecified( + def test_build_result_handles_ignore_agents( self, default_vaccine_reference ) -> None: - """Verify replace_unspecified filters out unspecified vaccines. + """Verify ignore_agents filters out unspecified vaccines. Real-world significance: - Input may contain "Not Specified" vaccine agents @@ -682,7 +682,7 @@ def test_build_result_handles_replace_unspecified( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=["Not Specified", "unspecified"], + ignore_agents=["Not Specified", "unspecified"], ) assert len(result.clients) == 1 @@ -708,7 +708,7 @@ def test_build_result_detects_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) # Should have 2 clients (no deduplication) @@ -744,7 +744,7 @@ def test_build_result_detects_multiple_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) # Should have 5 clients (no deduplication) @@ -776,7 +776,7 @@ def test_build_result_no_warning_for_unique_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=[], + ignore_agents=[], ) # Should have 3 unique clients From 07b1de8631e192e65dfcce5202ef4aba4e562cbd Mon Sep 17 00:00:00 2001 From: kassyray Date: Wed, 19 Nov 2025 20:54:07 +0000 Subject: [PATCH 05/10] Adding tests --- tests/unit/test_preprocess.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py index 7c5a643..74f4554 100644 --- a/tests/unit/test_preprocess.py +++ b/tests/unit/test_preprocess.py @@ -483,7 +483,7 @@ def 
test_build_result_generates_clients_with_sequences( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) assert len(result.clients) == 3 @@ -508,14 +508,14 @@ def test_build_result_sorts_clients_deterministically( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) result2 = preprocess.build_preprocess_result( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) ids1 = [c.client_id for c in result1.clients] @@ -569,7 +569,7 @@ def test_build_result_sorts_by_school_then_name( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) # Expected order: Apple/Chloe/Jones, Apple/Diana/Jones, Zebra/Alice/Smith, Zebra/Bob/Smith @@ -637,7 +637,7 @@ def test_build_result_handles_missing_board_name_with_warning( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) # Should still process - at least one client @@ -660,7 +660,7 @@ def test_build_result_french_language_support( normalized, language="fr", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) assert len(result.clients) == 1 @@ -669,7 +669,7 @@ def test_build_result_french_language_support( def test_build_result_handles_ignore_agents( self, default_vaccine_reference ) -> None: - """Verify ignore_agents filters out unspecified vaccines. + """Verify ignore_diseases filters out unspecified vaccines. 
Real-world significance: - Input may contain "Not Specified" vaccine agents @@ -682,7 +682,7 @@ def test_build_result_handles_ignore_agents( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=["Not Specified", "unspecified"], + ignore_diseases=["Not Specified", "unspecified"], ) assert len(result.clients) == 1 @@ -708,7 +708,7 @@ def test_build_result_detects_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) # Should have 2 clients (no deduplication) @@ -744,7 +744,7 @@ def test_build_result_detects_multiple_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) # Should have 5 clients (no deduplication) @@ -776,7 +776,7 @@ def test_build_result_no_warning_for_unique_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, - ignore_agents=[], + ignore_diseases=[], ) # Should have 3 unique clients From 162c861ca15becdb625826cd747c0f2db2ac28c2 Mon Sep 17 00:00:00 2001 From: kassyray Date: Wed, 19 Nov 2025 21:00:20 +0000 Subject: [PATCH 06/10] Adding tests --- tests/unit/test_preprocess.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py index 74f4554..c40335c 100644 --- a/tests/unit/test_preprocess.py +++ b/tests/unit/test_preprocess.py @@ -483,6 +483,7 @@ def test_build_result_generates_clients_with_sequences( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -508,6 +509,7 @@ def test_build_result_sorts_clients_deterministically( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -569,6 +571,7 @@ def test_build_result_sorts_by_school_then_name( normalized, language="en", vaccine_reference=default_vaccine_reference, + 
replace_unspecified=[], ignore_diseases=[], ) @@ -595,6 +598,7 @@ def test_build_result_maps_vaccines_correctly( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -637,6 +641,7 @@ def test_build_result_handles_missing_board_name_with_warning( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -660,6 +665,7 @@ def test_build_result_french_language_support( normalized, language="fr", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -682,6 +688,7 @@ def test_build_result_handles_ignore_agents( normalized, language="en", vaccine_reference=default_vaccine_reference, + replce_unspecified=[], ignore_diseases=["Not Specified", "unspecified"], ) @@ -708,6 +715,7 @@ def test_build_result_detects_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -744,6 +752,7 @@ def test_build_result_detects_multiple_duplicate_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -776,6 +785,7 @@ def test_build_result_no_warning_for_unique_client_ids( normalized, language="en", vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) From 4cbb36b97c689b5a6896333cae7cd14405f31ab7 Mon Sep 17 00:00:00 2001 From: kassyray Date: Wed, 19 Nov 2025 21:04:38 +0000 Subject: [PATCH 07/10] Adding tests --- tests/unit/test_preprocess.py | 3 ++- tests/unit/test_run_pipeline.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py index c40335c..7d47044 100644 --- a/tests/unit/test_preprocess.py +++ b/tests/unit/test_preprocess.py @@ -517,6 +517,7 @@ def test_build_result_sorts_clients_deterministically( normalized, language="en", 
vaccine_reference=default_vaccine_reference, + replace_unspecified=[], ignore_diseases=[], ) @@ -688,7 +689,7 @@ def test_build_result_handles_ignore_agents( normalized, language="en", vaccine_reference=default_vaccine_reference, - replce_unspecified=[], + replace_unspecified=[], ignore_diseases=["Not Specified", "unspecified"], ) diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index dff22a8..8ca9231 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -236,6 +236,7 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d output_dir=tmp_output_structure["root"], language="en", run_id="test_20250101_120000", + config_dir=tmp_test_dir, ) assert total == 2 From 2b5e68e2b5fea172a9116dc59a3f7385e25b08bc Mon Sep 17 00:00:00 2001 From: kassyray Date: Wed, 19 Nov 2025 21:18:17 +0000 Subject: [PATCH 08/10] Adding tests --- tests/unit/test_run_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index 8ca9231..10dcc73 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -208,7 +208,7 @@ def test_run_step_1_prepare_output_user_cancels( ) assert result is False - def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict) -> None: + def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict, config_file: Path) -> None: """Verify Step 2: preprocess returns client count.""" mock_df = MagicMock() mock_mapped_df = MagicMock() @@ -236,7 +236,7 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d output_dir=tmp_output_structure["root"], language="en", run_id="test_20250101_120000", - config_dir=tmp_test_dir, + config_dir=config_file.parent, ) assert total == 2 From f3d1bc65793117528b631ceac4c828786a727cbe Mon Sep 17 00:00:00 2001 From: kassyray Date: Fri, 21 Nov 2025 15:38:56 +0000 
Subject: [PATCH 09/10] Adding a patch to have HPV and HepB vaxes map to the other col --- pipeline/preprocess.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 9f217a6..eda11a7 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -661,7 +661,6 @@ def enrich_grouped_records( are always removed. """ - print(grouped) enriched: List[Dict[str, Any]] = [] @@ -689,6 +688,10 @@ def enrich_grouped_records( if isinstance(ref, list): diseases.extend(ref) + if "Hepatitis B" in diseases or "HPV" in diseases: + print("Found Hepatitis B or HPV in diseases") + print(ref) + diseases.extend(["Other"]) enriched.append( { From 73ef22806bbd67fc9b4bca5affee08e44ea77a27 Mon Sep 17 00:00:00 2001 From: kassyray Date: Thu, 27 Nov 2025 16:06:21 +0000 Subject: [PATCH 10/10] Adding functionality to remove non-ispa diseases besides hpv and hepb from the immunization histories of clients. This needs to be an optional request and should not yet be pushed to main --- config/parameters.yaml | 4 +- phu_templates/.gitkeep | 0 phu_templates/README.md | 111 ++++++++++++---------------------------- pipeline/preprocess.py | 90 ++++++++++++++++++-------------- templates/conf.typ | 2 +- 5 files changed, 88 insertions(+), 119 deletions(-) delete mode 100644 phu_templates/.gitkeep diff --git a/config/parameters.yaml b/config/parameters.yaml index 5a76b33..7774932 100644 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -1,6 +1,6 @@ bundling: bundle_size: 100 - group_by: null + group_by: school chart_diseases_header: - Diphtheria - Tetanus @@ -21,6 +21,8 @@ encryption: enabled: false password: template: '{date_of_birth_iso_compact}' +ignore_diseases: +- Other ignore_agents: - RSVAb - VarIg diff --git a/phu_templates/.gitkeep b/phu_templates/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/phu_templates/README.md b/phu_templates/README.md index 65309f5..c75c704 100644 --- 
a/phu_templates/README.md +++ b/phu_templates/README.md @@ -1,103 +1,56 @@ -# PHU Templates Directory +# WDGPH VIPER Template Library -This directory contains Public Health Unit (PHU) specific template customizations. +The VIPER Template Library is a private repository that provides a centralized, version-controlled collection of reusable templates that are used across provincial Public Health VIPER workflows. -## Usage +This includes Typst document templates, YAML configuration schemas, and other standardized assets required by the VIPER pipeline. -Each PHU should create a subdirectory here with their organization-specific templates: +By storing these artifacts in a dedicated private repository and consuming them as a Git submodule, we ensure the following: -``` -phu_templates/ -├── my_phu/ -│ ├── en_template.py (required for English output) -│ ├── fr_template.py (required for French output) -│ ├── conf.typ (required) -│ └── assets/ (optional - only if templates reference assets) -│ ├── logo.png (optional) -│ └── signature.png (optional) -``` - -## Running with PHU Templates +* Reproducibility: downstream pipelines reference an exact, pinned version +* Consistency: all PHUs use the same reviewed and approved templates +* Governance: updates are traceable, reviewable, and versioned +* Modularity: templates remain independent from pipeline logic +* Security: PHU-specific assets remain internal and controlled -To use a PHU-specific template, specify the template name with `--template`: +This repository is intended for **internal** use and is included as a **submodule** inside downstream pipelines under: -```bash -# Generate English notices -uv run viper students.xlsx en --template my_phu - -# Generate French notices -uv run viper students.xlsx fr --template my_phu +``` +/pipeline/phu_templates ``` -This will load templates from `phu_templates/my_phu/`. 
- -## Template File Requirements - -### Core Requirements (Always Required) - -- `conf.typ` - Typst configuration and utility functions - -### Language-Specific Requirements (Based on Output Language) - -- `en_template.py` - Required only if generating English notices (`--language en`) - - Must define `render_notice()` function - - Consulted only when `--language en` is specified - -- `fr_template.py` - Required only if generating French notices (`--language fr`) - - Must define `render_notice()` function - - Consulted only when `--language fr` is specified - -**Note:** A PHU may provide templates for only one language. If a user requests a language your template does not support, the pipeline will fail with a clear error message. If you only support one language, only include that template file (e.g., only `en_template.py`). - -### Asset Requirements (Based on Template Implementation) - -Assets in the `assets/` directory are **optional** and depend entirely on your template implementation: - -- `assets/logo.png` - Only required if your `en_template.py` or `fr_template.py` references a logo -- `assets/signature.png` - Only required if your `en_template.py` or `fr_template.py` references a signature -- Other files - Any additional assets (e.g., `assets/header.png`, `assets/seal.pdf`) may be included and referenced in your templates +Pipeline developers are encouraged to update the submodule when template changes are required, ensuring changes are reviewed and versioned before integration. -**Note:** If your template references an asset (e.g., `include "assets/logo.png"` in Typst), that asset **must** exist. The pipeline will fail with a clear error if a referenced asset is missing. +## Directory Structure -## Creating a PHU Template +Templates are stored inside a folder named using the standard PHU acronym. 
-If your PHU supports both English and French: +Within each PHU folder, templates and assets can follow the following structure: -```bash -cp -r templates/ phu_templates/my_phu/ +``` +<PHU_ACRONYM>/ +├── assets/ +│ ├── logo.png +│ └── signature.png +├── en_template.py +├── fr_template.py ``` -Then customize: -- Replace `assets/logo.png` with your PHU logo -- Replace `assets/signature.png` with your signature -- Modify `en_template.py` and `fr_template.py` as needed -- Adjust `conf.typ` for organization-specific styling +## Using the Submodule -### Testing Your Template +Downstream VIPER pipelines consume this repository as a Git submodule, allowing each pipeline to reference a specific, version-controlled snapshot of the templates. -```bash -# Test English generation -uv run viper students.xlsx en --template my_phu +### Adding the Submodule (Initial Setup) -# Test French generation (if you provided fr_template.py) -uv run viper students.xlsx fr --template my_phu -``` +If the pipeline does not yet include the template library: -If a language template is missing: ``` -FileNotFoundError: Template module not found: /path/to/phu_templates/my_phu/fr_template.py -Expected fr_template.py in /path/to/phu_templates/my_phu +git submodule add <repository-url> phu_templates +git submodule update --init --recursive ``` -If an asset referenced by your template is missing: +This will create: + ``` -FileNotFoundError: Logo not found: /path/to/phu_templates/my_phu/assets/logo.png +phu_templates/ # Points to this template library ``` -## Git Considerations - -**Important:** PHU-specific templates are excluded from version control via `.gitignore`. 
- -- Templates in this directory will NOT be committed to the main repository -- Each PHU should maintain their templates in their own fork or separate repository -- The `README.md` file and `.gitkeep` are the only tracked files in this directory \ No newline at end of file diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index eda11a7..e3834d3 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -654,51 +654,65 @@ def enrich_grouped_records( chart_diseases_header: List[str] | None = None, ignore_diseases: List[str] | None = None, ) -> List[Dict[str, Any]]: - """Enrich grouped vaccine records with disease information. - If chart_diseases_header is provided, diseases not in the list are - collapsed into the "Other" category. Diseases in ignore_diseases - are always removed. - """ + enriched = [] + ignore_set = set(ignore_diseases or []) + for item in grouped: - enriched: List[Dict[str, Any]] = [] + # Rule 1 — If ANY vaccine does not exist, drop entire row + if any(v not in vaccine_reference for v in item["vaccine"]): + continue - for item in grouped: - # Normalize vaccine names - vaccines = [ - v.replace("-unspecified", "*").replace(" unspecified", "*") - for v in item["vaccine"] + final_vaccines = [] + final_diseases = [] + + for vaccine in item["vaccine"]: + + disease_mapping = vaccine_reference.get(vaccine) + + # Rule 2 — If mapping is "Other", skip this vaccine entirely + if disease_mapping == ["Other"] or disease_mapping == "Other": + continue + + # Add vaccine to valid list + final_vaccines.append(vaccine) + + # Add disease mapping + if isinstance(disease_mapping, list): + final_diseases.extend(disease_mapping) + + # Rule 3 — HepB or HPV → also append "Other" + if any(d in ["Hepatitis B", "HPV"] for d in disease_mapping): + final_diseases.append("Other") + + # If no vaccines remain, drop entire row + if not final_vaccines: + continue + + # Rule 4 — Remove ignored diseases + final_diseases = [d for d in final_diseases if d not in 
ignore_set] + + # Rule 5 — Collapse diseases not in header → "Other" + if chart_diseases_header: + allowed = set(chart_diseases_header) + final_diseases = [ + d if d in allowed else "Other" + for d in final_diseases + ] + + vaccines_normalized = [ + v.replace("-unspecified", "*").replace(" unspecified", "*") + for v in final_vaccines ] - # Lookup diseases - diseases: List[str] = [] - for vaccine in vaccines: - # See if the vaccine has a mapping to diseases - has_mapping = vaccine in vaccine_reference - if not has_mapping: - # Do not add to list - # Remove from vaccines list - # Remove whole item from grouped if no vaccines left - grouped.remove(item) - break - else: - ref = vaccine_reference.get(vaccine, vaccine) - if isinstance(ref, list): - diseases.extend(ref) - - if "Hepatitis B" in diseases or "HPV" in diseases: - print("Found Hepatitis B or HPV in diseases") - print(ref) - diseases.extend(["Other"]) - - enriched.append( - { - "date_given": item["date_given"], - "vaccine": vaccines, - "diseases": diseases, - } + enriched.append( + { + "date_given": item["date_given"], + "vaccine": vaccines_normalized, + "diseases": final_diseases, + } ) return enriched diff --git a/templates/conf.typ b/templates/conf.typ index 9b23ccb..fe30b29 100644 --- a/templates/conf.typ +++ b/templates/conf.typ @@ -61,7 +61,7 @@ let table_content = align(center)[ #table( columns: columns, - rows: (envelope_window_height), + rows: auto, inset: font_size, col1_content, table.vline(stroke: vline_stroke),