diff --git a/config/parameters.yaml b/config/parameters.yaml index 5a76b33..7774932 100644 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -1,6 +1,6 @@ bundling: bundle_size: 100 - group_by: null + group_by: school chart_diseases_header: - Diphtheria - Tetanus @@ -21,6 +21,8 @@ encryption: enabled: false password: template: '{date_of_birth_iso_compact}' +ignore_diseases: +- Other ignore_agents: - RSVAb - VarIg diff --git a/phu_templates/.gitkeep b/phu_templates/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/phu_templates/README.md b/phu_templates/README.md index 65309f5..c75c704 100644 --- a/phu_templates/README.md +++ b/phu_templates/README.md @@ -1,103 +1,56 @@ -# PHU Templates Directory +# WDGPH VIPER Template Library -This directory contains Public Health Unit (PHU) specific template customizations. +The VIPER Template Library is a private repository that provides a centralized, version-controlled collection of reusable templates that are used across provincial Public Health VIPER workflows. -## Usage +This includes Typst document templates, YAML configuration schemas, and other standardized assets required by the VIPER pipeline. 
-Each PHU should create a subdirectory here with their organization-specific templates: +By storing these artifacts in a dedicated private repository and consuming them as a Git submodule, we ensure the following: -``` -phu_templates/ -├── my_phu/ -│ ├── en_template.py (required for English output) -│ ├── fr_template.py (required for French output) -│ ├── conf.typ (required) -│ └── assets/ (optional - only if templates reference assets) -│ ├── logo.png (optional) -│ └── signature.png (optional) -``` - -## Running with PHU Templates +* Reproducibility: downstream pipelines reference an exact, pinned version +* Consistency: all PHUs use the same reviewed and approved templates +* Governance: updates are traceable, reviewable, and versioned +* Modularity: templates remain independent from pipeline logic +* Security: PHU-specific assets remain internal and controlled -To use a PHU-specific template, specify the template name with `--template`: +This repository is intended for **internal** use and is included as a **submodule** inside downstream pipelines under: -```bash -# Generate English notices -uv run viper students.xlsx en --template my_phu - -# Generate French notices -uv run viper students.xlsx fr --template my_phu +``` +/pipeline/phu_templates ``` -This will load templates from `phu_templates/my_phu/`. - -## Template File Requirements - -### Core Requirements (Always Required) - -- `conf.typ` - Typst configuration and utility functions - -### Language-Specific Requirements (Based on Output Language) - -- `en_template.py` - Required only if generating English notices (`--language en`) - - Must define `render_notice()` function - - Consulted only when `--language en` is specified - -- `fr_template.py` - Required only if generating French notices (`--language fr`) - - Must define `render_notice()` function - - Consulted only when `--language fr` is specified - -**Note:** A PHU may provide templates for only one language. 
If a user requests a language your template does not support, the pipeline will fail with a clear error message. If you only support one language, only include that template file (e.g., only `en_template.py`). - -### Asset Requirements (Based on Template Implementation) - -Assets in the `assets/` directory are **optional** and depend entirely on your template implementation: - -- `assets/logo.png` - Only required if your `en_template.py` or `fr_template.py` references a logo -- `assets/signature.png` - Only required if your `en_template.py` or `fr_template.py` references a signature -- Other files - Any additional assets (e.g., `assets/header.png`, `assets/seal.pdf`) may be included and referenced in your templates +Pipeline developers are encouraged to update the submodule when template changes are required, ensuring changes are reviewed and versioned before integration. -**Note:** If your template references an asset (e.g., `include "assets/logo.png"` in Typst), that asset **must** exist. The pipeline will fail with a clear error if a referenced asset is missing. +## Directory Structure -## Creating a PHU Template +Templates are stored inside a folder named using the standard PHU acronym. -If your PHU supports both English and French: +Within each PHU folder, templates and assets can follow the following structure: -```bash -cp -r templates/ phu_templates/my_phu/ +``` +/ +├── assets/ +│ ├── logo.png +│ └── signature.png +├──en_template.py +├──fr_template.py ``` -Then customize: -- Replace `assets/logo.png` with your PHU logo -- Replace `assets/signature.png` with your signature -- Modify `en_template.py` and `fr_template.py` as needed -- Adjust `conf.typ` for organization-specific styling +## Using the Submodule -### Testing Your Template +Downstream VIPER pipelines consume this repository as a Git submodule, allowing each pipeline to reference a specific, version-controlled snapshot of the templates. 
-```bash -# Test English generation -uv run viper students.xlsx en --template my_phu +### Adding the Submodule (Initial Setup) -# Test French generation (if you provided fr_template.py) -uv run viper students.xlsx fr --template my_phu -``` +If the pipeline does not yet include the template library: -If a language template is missing: ``` -FileNotFoundError: Template module not found: /path/to/phu_templates/my_phu/fr_template.py -Expected fr_template.py in /path/to/phu_templates/my_phu +git submodule add REPOSITORY_URL phu_templates +git submodule update --init --recursive ``` -If an asset referenced by your template is missing: +This will create: + ``` -FileNotFoundError: Logo not found: /path/to/phu_templates/my_phu/assets/logo.png +phu_templates/ # Points to this template library ``` -## Git Considerations - -**Important:** PHU-specific templates are excluded from version control via `.gitignore`. - -- Templates in this directory will NOT be committed to the main repository -- Each PHU should maintain their templates in their own fork or separate repository -- The `README.md` file and `.gitkeep` are the only tracked files in this directory \ No newline at end of file diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 9517c6f..5395c68 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -209,6 +209,7 @@ def run_step_2_preprocess( output_dir: Path, language: str, run_id: str, + config_dir: Path, ) -> int: """Step 2: Preprocessing. 
@@ -231,12 +232,38 @@ def run_step_2_preprocess( df = preprocess.check_addresses_complete(df) # Load configuration + config = load_config(config_dir / "parameters.yaml") + ignore_diseases = config.get("ignore_diseases", {}) + vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8")) + if ignore_diseases: + ignore_set = set(ignore_diseases) + + filtered_reference = {} + + for vaccine, agents in vaccine_reference.items(): + + # Remove ignored diseases from agent list + cleaned_agents = [a for a in agents if a not in ignore_set] + + # Keep only if something remains + if cleaned_agents: + filtered_reference[vaccine] = cleaned_agents + + print(f"Ignored diseases: {', '.join(sorted(ignore_set))}") + print( + f"Filtered vaccine reference to {len(filtered_reference)} vaccines " + f"from {len(vaccine_reference)} total." + ) + + else: + filtered_reference = vaccine_reference + # Build preprocessing result result = preprocess.build_preprocess_result( - df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED + df, language, filtered_reference, preprocess.REPLACE_UNSPECIFIED, ignore_diseases ) # Write artifact @@ -574,6 +601,7 @@ def main() -> int: output_dir, args.language, run_id, + config_dir ) step_duration = time.time() - step_start step_times.append(("Preprocessing", step_duration)) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 6209d54..e3834d3 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -652,63 +652,69 @@ def enrich_grouped_records( vaccine_reference: Dict[str, Any], language: str, chart_diseases_header: List[str] | None = None, + ignore_diseases: List[str] | None = None, ) -> List[Dict[str, Any]]: - """Enrich grouped vaccine records with disease information. - If chart_diseases_header is provided, diseases not in the list are - collapsed into the "Other" category. 
+ enriched = [] + ignore_set = set(ignore_diseases or []) - Parameters - ---------- - grouped : List[Dict[str, Any]] - Grouped vaccine records with date_given and vaccine list. - vaccine_reference : Dict[str, Any] - Map of vaccine codes to disease names. - language : str - Language code for logging. - chart_diseases_header : List[str], optional - List of diseases to include in chart. Diseases not in this list - are mapped to "Other". - - Returns - ------- - List[Dict[str, Any]] - Enriched records with date_given, vaccine, and diseases fields. - """ - enriched: List[Dict[str, Any]] = [] for item in grouped: - vaccines = [ - v.replace("-unspecified", "*").replace(" unspecified", "*") - for v in item["vaccine"] - ] - diseases: List[str] = [] - for vaccine in vaccines: - ref = vaccine_reference.get(vaccine, vaccine) - if isinstance(ref, list): - diseases.extend(ref) - else: - diseases.append(ref) - # Collapse diseases not in chart to "Other" + # Rule 1 — If ANY vaccine does not exist, drop entire row + if any(v not in vaccine_reference for v in item["vaccine"]): + continue + + final_vaccines = [] + final_diseases = [] + + for vaccine in item["vaccine"]: + + disease_mapping = vaccine_reference.get(vaccine) + + # Rule 2 — If mapping is "Other", skip this vaccine entirely + if disease_mapping == ["Other"] or disease_mapping == "Other": + continue + + # Add vaccine to valid list + final_vaccines.append(vaccine) + + # Add disease mapping + if isinstance(disease_mapping, list): + final_diseases.extend(disease_mapping) + + # Rule 3 — HepB or HPV → also append "Other" + if any(d in ["Hepatitis B", "HPV"] for d in disease_mapping): + final_diseases.append("Other") + + # If no vaccines remain, drop entire row + if not final_vaccines: + continue + + # Rule 4 — Remove ignored diseases + final_diseases = [d for d in final_diseases if d not in ignore_set] + + # Rule 5 — Collapse diseases not in header → "Other" if chart_diseases_header: - filtered_diseases: List[str] = [] - 
has_unmapped = False - for disease in diseases: - if disease in chart_diseases_header: - filtered_diseases.append(disease) - else: - has_unmapped = True - if has_unmapped and "Other" not in filtered_diseases: - filtered_diseases.append("Other") - diseases = filtered_diseases + allowed = set(chart_diseases_header) + final_diseases = [ + d if d in allowed else "Other" + for d in final_diseases + ] + + vaccines_normalized = [ + v.replace("-unspecified", "*").replace(" unspecified", "*") + for v in final_vaccines + ] + enriched.append( { "date_given": item["date_given"], - "vaccine": vaccines, - "diseases": diseases, + "vaccine": vaccines_normalized, + "diseases": final_diseases, } ) + return enriched @@ -717,6 +723,7 @@ def build_preprocess_result( language: str, vaccine_reference: Dict[str, Any], replace_unspecified: List[str], + ignore_diseases ) -> PreprocessResult: """Process and normalize client data into structured artifact. @@ -790,7 +797,7 @@ def build_preprocess_result( ] received_grouped = process_received_agents(row.IMMS_GIVEN, replace_unspecified) # type: ignore[attr-defined] received = enrich_grouped_records( - received_grouped, vaccine_reference, language, chart_diseases_header + received_grouped, vaccine_reference, language, chart_diseases_header, ignore_diseases=None ) postal_code = row.POSTAL_CODE if row.POSTAL_CODE else "Not provided" # type: ignore[attr-defined] diff --git a/templates/conf.typ b/templates/conf.typ index 9b23ccb..fe30b29 100644 --- a/templates/conf.typ +++ b/templates/conf.typ @@ -61,7 +61,7 @@ let table_content = align(center)[ #table( columns: columns, - rows: (envelope_window_height), + rows: auto, inset: font_size, col1_content, table.vline(stroke: vline_stroke), diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py index 471ff23..7d47044 100644 --- a/tests/unit/test_preprocess.py +++ b/tests/unit/test_preprocess.py @@ -484,6 +484,7 @@ def test_build_result_generates_clients_with_sequences( language="en", 
vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) assert len(result.clients) == 3 @@ -509,6 +510,7 @@ def test_build_result_sorts_clients_deterministically( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) result2 = preprocess.build_preprocess_result( @@ -516,6 +518,7 @@ def test_build_result_sorts_clients_deterministically( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) ids1 = [c.client_id for c in result1.clients] @@ -570,6 +573,7 @@ def test_build_result_sorts_by_school_then_name( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Expected order: Apple/Chloe/Jones, Apple/Diana/Jones, Zebra/Alice/Smith, Zebra/Bob/Smith @@ -596,6 +600,7 @@ def test_build_result_maps_vaccines_correctly( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Should have DTaP expanded to component diseases @@ -638,6 +643,7 @@ def test_build_result_handles_missing_board_name_with_warning( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Should still process - at least one client @@ -661,15 +667,16 @@ def test_build_result_french_language_support( language="fr", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) assert len(result.clients) == 1 assert result.clients[0].language == "fr" - def test_build_result_handles_replace_unspecified( + def test_build_result_handles_ignore_agents( self, default_vaccine_reference ) -> None: - """Verify replace_unspecified filters out unspecified vaccines. + """Verify ignore_diseases filters out unspecified vaccines. 
Real-world significance: - Input may contain "Not Specified" vaccine agents @@ -682,7 +689,8 @@ def test_build_result_handles_replace_unspecified( normalized, language="en", vaccine_reference=default_vaccine_reference, - replace_unspecified=["Not Specified", "unspecified"], + replace_unspecified=[], + ignore_diseases=["Not Specified", "unspecified"], ) assert len(result.clients) == 1 @@ -709,6 +717,7 @@ def test_build_result_detects_duplicate_client_ids( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Should have 2 clients (no deduplication) @@ -745,6 +754,7 @@ def test_build_result_detects_multiple_duplicate_client_ids( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Should have 5 clients (no deduplication) @@ -777,6 +787,7 @@ def test_build_result_no_warning_for_unique_client_ids( language="en", vaccine_reference=default_vaccine_reference, replace_unspecified=[], + ignore_diseases=[], ) # Should have 3 unique clients diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index dff22a8..10dcc73 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -208,7 +208,7 @@ def test_run_step_1_prepare_output_user_cancels( ) assert result is False - def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict) -> None: + def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: dict, config_file: Path) -> None: """Verify Step 2: preprocess returns client count.""" mock_df = MagicMock() mock_mapped_df = MagicMock() @@ -236,6 +236,7 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d output_dir=tmp_output_structure["root"], language="en", run_id="test_20250101_120000", + config_dir=config_file.parent, ) assert total == 2