From bd38404de1234fbf027fbd749d894c7b1edc1f4b Mon Sep 17 00:00:00 2001 From: Erfan Nourbakhsh Date: Thu, 8 May 2025 05:26:05 -0400 Subject: [PATCH 1/4] Enable exception diagnostics output in pipetask report --- python/lsst/ctrl/mpexec/cli/cmd/commands.py | 26 +++- python/lsst/ctrl/mpexec/cli/script/report.py | 153 +++++++++++++++++-- 2 files changed, 162 insertions(+), 17 deletions(-) diff --git a/python/lsst/ctrl/mpexec/cli/cmd/commands.py b/python/lsst/ctrl/mpexec/cli/cmd/commands.py index 7116d351..65b437b6 100644 --- a/python/lsst/ctrl/mpexec/cli/cmd/commands.py +++ b/python/lsst/ctrl/mpexec/cli/cmd/commands.py @@ -338,6 +338,22 @@ def update_graph_run( "or when using the --force-v2 option, this should be a json file. We will be " "deprecating the single-graph-only (QuantumGraphExecutionReport) option soon.", ) +@click.option( + "--show-exception-diagnostics", + is_flag=True, + default=False, + help="Show exception diagnostics in the report. This is a detailed report of " + "the exceptions that occurred during the execution of the pipeline, combined " + "with the exposure dimension records.", +) +@click.option( + "--exception-diagnostics-filename", + default="", + help="Write to a file with this name a detailed exception report joined with exposure " + "dimension records. It's useful for spotting trends in exception types across data IDs. " + "Uses Astropy-supported formats based on file extension (e.g., html, csv, ecsv, pandas.json). " + "Full list: https://docs.astropy.org/en/stable/io/unified.html", +) @click.option("--logs/--no-logs", default=True, help="Get butler log datasets for extra information.") @click.option( "--brief", @@ -384,6 +400,8 @@ def report( collections: Sequence[str] | None, where: str, full_output_filename: str = "", + exception_diagnostics_filename: str = "", + show_exception_diagnostics: bool = False, logs: bool = True, brief: bool = False, curse_failed_logs: bool = False, @@ -422,6 +440,8 @@ def report( collections, where, full_output_filename, + exception_diagnostics_filename, + show_exception_diagnostics, logs, brief, curse_failed_logs, @@ -450,7 +470,9 @@ def report( " also printed to the screen when using the --full-output-filename option.", ) def aggregate_reports( - filenames: Sequence[str], full_output_filename: str | None, brief: bool = False + filenames: Sequence[str], + full_output_filename: str | None, + brief: bool = False, ) -> None: """Aggregate pipetask report output on disjoint data-id groups into one Summary over common tasks and datasets. Intended for use when the same @@ -466,4 +488,4 @@ def aggregate_reports( FILENAMES are the space-separated paths to json file output created by pipetask report. """ - script.aggregate_reports(filenames, full_output_filename, brief) + script.aggregate_reports(filenames, full_output_filename, brief=brief) diff --git a/python/lsst/ctrl/mpexec/cli/script/report.py b/python/lsst/ctrl/mpexec/cli/script/report.py index dce4adfb..ada9ebfe 100644 --- a/python/lsst/ctrl/mpexec/cli/script/report.py +++ b/python/lsst/ctrl/mpexec/cli/script/report.py @@ -28,6 +28,7 @@ from collections.abc import Sequence from typing import Literal +from astropy.io.registry import get_formats from astropy.table import Table from lsst.daf.butler import Butler @@ -122,6 +123,8 @@ def report_v2( collections: Sequence[str] | None, where: str, full_output_filename: str | None, + exception_diagnostics_filename: str | None, + show_exception_diagnostics: bool = False, logs: bool = True, brief: bool = False, curse_failed_logs: bool = False, @@ -141,9 +144,9 @@ def report_v2( butler_config : `str` The Butler used for this report. This should match the Butler used for the run associated with the executed quantum graph. - qgraph_uris : `Sequence` [`str`] + qgraph_uris : `Sequence` of `str` One or more uris to the serialized Quantum Graph(s). - collections : `Sequence` [`str`] | None` + collections : `Sequence` of `str` or `None` Collection(s) associated with said graphs/processing. For use in `lsst.daf.butler.registry.queryDatasets` if paring down the query would be useful. @@ -155,14 +158,27 @@ def report_v2( tools such as the ones used by Campaign Management software and pilots, and for searching and counting specific kinds or instances of failures. This option will also print a "brief" (counts-only) summary to stdout. - logs : `bool` + exception_diagnostics_filename : `str` + Output the exception diagnostics into an Astropy-supported format + based on file extension (e.g., html, csv, ecsv, pandas.json). + Full list: https://docs.astropy.org/en/stable/io/unified.html. + The "html" and "htm" formats are recommended as they'll be made + interactive (searchable and sortable) using Astropy's JSViewer writer. + This is a troubleshooting-oriented report of the exceptions combined + with the exposure dimension records. + show_exception_diagnostics : `bool` + Show exception diagnostics on stdout. This will be a truncated view of + the exception diagnostics table, if the table is too large. To save the + full table to a file, use the ``exception_diagnostics_filename`` + argument. + logs : `bool`, optional Store error messages from Butler logs associated with failed quanta if `True`. - brief : `bool` + brief : `bool`, optional Only display short (counts-only) summary on stdout. This includes counts and not error messages or data_ids (similar to BPS report). This option will still report all `cursed` datasets and `wonky` quanta. - curse_failed_logs : `bool` + curse_failed_logs : `bool`, optional Mark log datasets as `cursed` if they are published in the final output collection. Note that a campaign-level collection must be used here for `collections` if `curse_failed_logs` is `True`; if @@ -183,7 +199,7 @@ def report_v2( This should reduce the number of database operations. n_cores : `int`, optional Number of cores for metadata and log reads. - view_graph : `bool` + view_graph : `bool`, optional Display a graph representation of `QuantumProvenanceGraph.Summary` on stdout instead of the default plain-text summary. Pipeline graph nodes are then annotated with their status. This is a useful way to visualize @@ -228,8 +244,22 @@ def report_v2( status_annotator=status_annotator, status_options=status_options, ) + + if full_output_filename or exception_diagnostics_filename or show_exception_diagnostics: + print( + "Warning: you have requested to write the summary to a file or show or save exception " + "diagnostics, but this will not be done when viewing the graph. Please run again without " + "--view_graph to get those outputs." + ) else: - print_summary(summary, full_output_filename, brief) + print_summary( + summary, + full_output_filename, + exception_diagnostics_filename=exception_diagnostics_filename, + show_exception_diagnostics=show_exception_diagnostics, + brief=brief, + butler=butler if show_exception_diagnostics or exception_diagnostics_filename else None, + ) def aggregate_reports( @@ -242,15 +272,15 @@ def aggregate_reports( Parameters ---------- - filenames : `Sequence[str]` + filenames : `Sequence` of `str` The paths to the JSON files produced by `pipetask report` (note: this is only compatible with the multi-graph or `--force-v2` option). These files correspond to the `QuantumProvenanceGraph.Summary` objects which are produced for each group. - full_output_filename : `str | None` + full_output_filename : `str` or `None` The name of the JSON file in which to store the aggregate report, if passed. This is passed to `print_summary` at the end of this function. - brief : `bool = False` + brief : `bool`, optional Only display short (counts-only) summary on stdout. This includes counts and not error messages or data_ids (similar to BPS report). This option will still report all `cursed` datasets and `wonky` @@ -262,10 +292,54 @@ def aggregate_reports( model = Summary.model_validate_json(f.read()) summaries.extend([model]) result = Summary.aggregate(summaries) - print_summary(result, full_output_filename, brief) + print_summary(result, full_output_filename, brief=brief) + + +def infer_write_format_from_filename(filename: str, default: str = "jsviewer") -> str: + """Guess the appropriate Astropy `Table.write` format from a filename. + + Parameters + ---------- + filename : `str` + The name of the file to be written. + default : `str`, optional + The default format to return if no match is found. Default is + "jsviewer" for interactive HTML output. + + Returns + ------- + format : str + A valid format string that can be passed to `Table.write(format=...)`. + """ + formats = get_formats(Table) + write_formats = {str(f["Format"]).lower() for f in formats if f["Write"]} + + filename = filename.lower() + parts = filename.split(".") + + # Try full two-part suffix (e.g., "pandas.json"). + if len(parts) >= 2: + suffix = ".".join(parts[-2:]) + if suffix in write_formats: + return suffix + + # Try just the last extension (e.g., "ecsv"). + if len(parts) >= 1: + ext = parts[-1] + if ext in write_formats: + return ext + return default -def print_summary(summary: Summary, full_output_filename: str | None, brief: bool = False) -> None: + +def print_summary( + summary: Summary, + full_output_filename: str | None, + exception_diagnostics_filename: str | None = None, + show_exception_diagnostics: bool = False, + brief: bool = False, + butler: Butler | None = None, +) -> None: """Take a `QuantumProvenanceGraph.Summary` object and write it to a file and/or the screen. @@ -274,15 +348,64 @@ def print_summary(summary: Summary, full_output_filename: str | None, brief: boo summary : `QuantumProvenanceGraph.Summary` This `Pydantic` model contains all the information derived from the `QuantumProvenanceGraph`. - full_output_filename : `str | None` + full_output_filename : `str` or `None` Name of the JSON file in which to store summary information, if passed. + exception_diagnostics_filename : `str` or `None` + Name of the file to write exception diagnostics to, if provided. This + report combines exception information with exposure dimension records, + useful for troubleshooting. The output format is inferred from the + filename extension. If the extension is "htm", "html", or unrecognized, + the format defaults to "jsviewer" to generate an interactive, + searchable and sortable HTML table. + show_exception_diagnostics : `bool` + Display exception diagnostics on stdout. If the table is too large, + only a truncated version will be shown. To save the complete table, use + the ``exception_diagnostics_filename`` argument. brief : `bool` Only display short (counts-only) summary on stdout. This includes counts and not error messages or data_ids (similar to BPS report). - Ignored (considered `False`) if ``full_output_filename`` is passed. + Ignored (considered `False`) if ``full_output_filename`` or + ``exception_diagnostics_filename`` is passed. + butler : `lsst.daf.butler.Butler` or `None` + Butler to use for the report. This is needed to get the exposure + dimension records for the exception diagnostics. """ - summary.pprint(brief=(brief or bool(full_output_filename))) + summary.pprint( + brief=(brief or bool(full_output_filename) or bool(exception_diagnostics_filename)), + show_exception_diagnostics=show_exception_diagnostics, + butler=butler, + ) if full_output_filename: + # Write the full summary to a file. with open(full_output_filename, "w") as stream: stream.write(summary.model_dump_json(indent=2)) + print(f"Full summary written to {full_output_filename}") + + if exception_diagnostics_filename: + exception_diagnostics_table = summary.make_exception_diagnostics_table(butler) + + # Check the file extension to determine the format. + fmt = infer_write_format_from_filename(exception_diagnostics_filename) + + kwargs: dict[str, bool | int | str] = {"overwrite": True} + + if fmt in ("htm", "html"): + # Use the interactive HTML format for HTML files. + fmt = "jsviewer" + + kwargs["format"] = fmt + + if fmt == "jsviewer": + # Set the maximum number of lines to the length of the table. + # Otherwise it'll default to 5000 lines. + kwargs["max_lines"] = len(exception_diagnostics_table) + + # If the filename doesn't end with ".html" or ".htm", add ".html". + # This is needed for the file to be recognized as an HTML file. + if not exception_diagnostics_filename.endswith((".html", ".htm")): + exception_diagnostics_filename += ".html" + + # Write the exception diagnostics table to a file. + exception_diagnostics_table.write(exception_diagnostics_filename, **kwargs) + print(f"Exception diagnostics written to {exception_diagnostics_filename}") From 81fd77ac51db6a1795c5ec9d0ecac9b4c30b273a Mon Sep 17 00:00:00 2001 From: Erfan Nourbakhsh Date: Sun, 11 May 2025 17:03:24 -0400 Subject: [PATCH 2/4] Ignore mypy abstract class error --- python/lsst/ctrl/mpexec/cli/pipetask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/ctrl/mpexec/cli/pipetask.py b/python/lsst/ctrl/mpexec/cli/pipetask.py index fb1d41d7..f369c46a 100644 --- a/python/lsst/ctrl/mpexec/cli/pipetask.py +++ b/python/lsst/ctrl/mpexec/cli/pipetask.py @@ -43,7 +43,7 @@ class PipetaskCLI(LoaderCLI): localCmdPkg = "lsst.ctrl.mpexec.cli.cmd" -@click.command(cls=PipetaskCLI, context_settings=dict(help_option_names=["-h", "--help"])) +@click.command(cls=PipetaskCLI, context_settings=dict(help_option_names=["-h", "--help"])) # type: ignore @log_level_option() @long_log_option() @log_file_option() From f6f6b73f7d81174085b00334a69ced8953299783 Mon Sep 17 00:00:00 2001 From: Erfan Nourbakhsh Date: Mon, 12 May 2025 16:17:20 -0400 Subject: [PATCH 3/4] Add release note for exception diagnostics output in pipetask report --- doc/changes/DM-50550.feature.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/changes/DM-50550.feature.md diff --git a/doc/changes/DM-50550.feature.md b/doc/changes/DM-50550.feature.md new file mode 100644 index 00000000..8985a6c7 --- /dev/null +++ b/doc/changes/DM-50550.feature.md @@ -0,0 +1 @@ +Enable exception diagnostics output in `pipetask report`. \ No newline at end of file From 85232cc19fa801032f4b6e94beff81577dc1fd43 Mon Sep 17 00:00:00 2001 From: Erfan Nourbakhsh Date: Tue, 13 May 2025 16:18:17 -0400 Subject: [PATCH 4/4] Reuse exception diagnostics table if already available --- python/lsst/ctrl/mpexec/cli/script/report.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/lsst/ctrl/mpexec/cli/script/report.py b/python/lsst/ctrl/mpexec/cli/script/report.py index ada9ebfe..10af9872 100644 --- a/python/lsst/ctrl/mpexec/cli/script/report.py +++ b/python/lsst/ctrl/mpexec/cli/script/report.py @@ -371,10 +371,11 @@ def print_summary( Butler to use for the report. This is needed to get the exposure dimension records for the exception diagnostics. """ - summary.pprint( + exception_diagnostics_table = summary.pprint( brief=(brief or bool(full_output_filename) or bool(exception_diagnostics_filename)), show_exception_diagnostics=show_exception_diagnostics, butler=butler, + return_exception_diagnostics_table=show_exception_diagnostics and exception_diagnostics_filename, ) if full_output_filename: # Write the full summary to a file. @@ -383,7 +384,10 @@ def print_summary( print(f"Full summary written to {full_output_filename}") if exception_diagnostics_filename: - exception_diagnostics_table = summary.make_exception_diagnostics_table(butler) + # Create the exception diagnostics table if it wasn't created in the + # previous step due to `show_exception_diagnostics` being False. + if not exception_diagnostics_table: + exception_diagnostics_table = summary.make_exception_diagnostics_table(butler) # Check the file extension to determine the format. fmt = infer_write_format_from_filename(exception_diagnostics_filename)