NOTE: line structure restored from a whitespace-collapsed copy of this patch; blank
context lines were inferred from the hunk line counts. The post-image blob hashes on
the `index` lines predate the review edits made to the `+` lines and are stale.

diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py
index 41eed9234..050f3353a 100644
--- a/examples/Basics/simple_flows_and_runs_tutorial.py
+++ b/examples/Basics/simple_flows_and_runs_tutorial.py
@@ -54,7 +54,17 @@
 
 # %% [markdown]
 # ## Upload the machine learning experiments to OpenML
-# First, create a fow and fill it with metadata about the machine learning model.
+#
+# ### Option A: Automatic publishing (simplified)
+# The publish function automatically detects the model type and creates the flow:
+
+# %%
+knn_flow = openml.publish(clf, tags=["openml_tutorial_knn"])
+print(f"Flow was auto-published with ID {knn_flow.flow_id}")
+
+# %% [markdown]
+# ### Option B: Manual flow construction (full control)
+# For advanced use cases, you can manually construct the flow:
 
 # %%
 knn_flow = openml.flows.OpenMLFlow(
@@ -77,6 +87,9 @@
 knn_flow.publish()
 print(f"knn_flow was published with the ID {knn_flow.flow_id}")
 
+# %% [markdown]
+# Both options publish a flow; `knn_flow` now refers to the one built manually in Option B.
+
 # %% [markdown]
 # Second, we create a run to store the results associated with the flow.
 
diff --git a/openml/__init__.py b/openml/__init__.py
index c49505eb9..0ff233394 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -18,6 +18,9 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+import contextlib
+from typing import Any, Sequence
+
 from . import (
     _api_calls,
     config,
@@ -33,6 +36,7 @@
     utils,
 )
 from .__version__ import __version__
+from .base import OpenMLBase
 from .datasets import OpenMLDataFeature, OpenMLDataset
 from .evaluations import OpenMLEvaluation
 from .flows import OpenMLFlow
@@ -50,6 +54,130 @@
 )
 
 
+def publish(obj: Any, *, name: str | None = None, tags: Sequence[str] | None = None) -> Any:
+    """Publish a common object (flow/model/run/dataset) with minimal friction.
+
+    This function provides a unified entry point for publishing various OpenML objects.
+    It automatically detects the object type and routes to the appropriate publishing
+    mechanism:
+
+    - For OpenML objects (``OpenMLDataset``, ``OpenMLFlow``, ``OpenMLRun``, etc.),
+      it directly calls their ``publish()`` method.
+    - For external models (e.g., scikit-learn estimators), it uses registered
+      extensions to convert them to ``OpenMLFlow`` objects before publishing.
+
+    Parameters
+    ----------
+    obj : Any
+        The object to publish. Can be:
+        - An OpenML object (OpenMLDataset, OpenMLFlow, OpenMLRun, OpenMLTask)
+        - A machine learning model from a supported framework (e.g., scikit-learn)
+    name : str, optional
+        Override the default name for the published object.
+        If not provided, uses the object's default naming convention.
+    tags : Sequence[str], optional
+        Additional tags to attach to the published object.
+        Will be merged with any existing tags, removing duplicates while
+        preserving order. A bare string is treated as a single tag.
+
+    Returns
+    -------
+    Any
+        The published object (typically with updated ID and metadata).
+
+    Raises
+    ------
+    ValueError
+        If no extension is registered to handle the provided model type.
+
+    Examples
+    --------
+    Publishing an OpenML dataset:
+
+    >>> dataset = openml.datasets.get_dataset(61)
+    >>> openml.publish(dataset, tags=["example"])
+
+    Publishing a scikit-learn model:
+
+    >>> from sklearn.tree import DecisionTreeClassifier
+    >>> clf = DecisionTreeClassifier(max_depth=5)
+    >>> openml.publish(clf, name="MyDecisionTree", tags=["tutorial"])
+
+    Publishing an OpenML flow directly:
+
+    >>> flow = openml.flows.OpenMLFlow(...)
+    >>> openml.publish(flow)
+
+    Publishing an OpenML run (after execution with predictions):
+
+    >>> run = openml.runs.OpenMLRun(
+    ...     task_id=1, flow_id=100, dataset_id=61,
+    ...     data_content=predictions  # predictions from model evaluation
+    ... )
+    >>> openml.publish(run, tags=["experiment"])
+
+    Notes
+    -----
+    For external models (e.g., scikit-learn), the corresponding extension must be
+    installed (e.g., ``openml-sklearn``). The extension will be automatically imported
+    if available.
+    """
+    # Case 1: Object is already an OpenML entity
+    if isinstance(obj, OpenMLBase):
+        if tags is not None and hasattr(obj, "tags"):
+            obj.tags = _merge_tags(getattr(obj, "tags", None), tags)
+        if name is not None and hasattr(obj, "name"):
+            obj.name = name
+        return obj.publish()
+
+    # Case 2: Object is an external model - use extension registry
+    # Attempt to auto-import common extensions
+    _ensure_extension_imported(obj)
+
+    extension = extensions.functions.get_extension_by_model(obj, raise_if_no_extension=True)
+    if extension is None:  # Defensive check (should not occur with raise_if_no_extension=True)
+        raise ValueError("No extension registered to handle the provided object.")
+    flow = extension.model_to_flow(obj)
+
+    if name is not None:
+        flow.name = name
+
+    if tags is not None:
+        flow.tags = _merge_tags(getattr(flow, "tags", None), tags)
+
+    return flow.publish()
+
+
+def _merge_tags(existing: Sequence[str] | None, new: Sequence[str]) -> list[str]:
+    """Merge ``new`` tags into ``existing`` ones, dropping duplicates in order.
+
+    A bare string passed as ``new`` is treated as one tag; unpacking it directly
+    would split it into single-character tags.
+    """
+    if isinstance(new, str):
+        new = [new]
+    return list(dict.fromkeys([*(existing or []), *new]))
+
+
+def _ensure_extension_imported(obj: Any) -> None:
+    """Attempt to import the appropriate extension for common frameworks.
+
+    This is a convenience helper to automatically import extensions for
+    well-known frameworks, reducing friction for users.
+
+    Parameters
+    ----------
+    obj : Any
+        The object to check.
+    """
+    obj_module = type(obj).__module__
+
+    # Check for scikit-learn models
+    if obj_module.startswith("sklearn"):
+        with contextlib.suppress(ImportError):
+            import openml_sklearn  # noqa: F401
+
+
 def populate_cache(
     task_ids: list[int] | None = None,
     dataset_ids: list[int | str] | None = None,
@@ -120,4 +248,5 @@ def populate_cache(
     "utils",
     "_api_calls",
     "__version__",
+    "publish",
 ]
diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py
index 998046726..28e6c4e1c 100644
--- a/tests/test_openml/test_openml.py
+++ b/tests/test_openml/test_openml.py
@@ -41,3 +41,64 @@ def test_populate_cache(
         assert task_mock.call_count == 2
         for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]):
             assert argument[0] == fixture
+
+    def test_publish_with_openml_object_merges_tags_and_name(self):
+        class Dummy(openml.base.OpenMLBase):
+            def __init__(self) -> None:
+                self.tags = ["a"]
+                self.name = "orig"
+                self.published = False
+
+            @property
+            def id(self):
+                return None
+
+            def _get_repr_body_fields(self):
+                return []
+
+            def _to_dict(self):
+                return {}
+
+            def _parse_publish_response(self, xml_response):
+                return None
+
+            def publish(self):
+                self.published = True
+                return self
+
+        obj = Dummy()
+        result = openml.publish(obj, name="new", tags=["b", "a"])
+        assert result is obj
+        assert obj.published is True
+        assert obj.name == "new"
+        assert obj.tags == ["a", "b"]  # dedup and preserve order from original
+
+    @mock.patch("openml.extensions.functions.get_extension_by_model")
+    def test_publish_with_extension(self, get_ext_mock):
+        flow_mock = mock.MagicMock()
+        flow_mock.tags = []
+        flow_mock.publish.return_value = "flow-id"
+
+        ext_instance = mock.MagicMock()
+        ext_instance.model_to_flow.return_value = flow_mock
+        get_ext_mock.return_value = ext_instance
+
+        model = object()
+        flow_id = openml.publish(model, name="n", tags=["x"])
+
+        get_ext_mock.assert_called_once_with(model, raise_if_no_extension=True)
+        ext_instance.model_to_flow.assert_called_once_with(model)
+        assert flow_mock.name == "n"
+        assert flow_mock.tags == ["x"]
+        flow_mock.publish.assert_called_once_with()
+        assert flow_id == "flow-id"
+
+    @mock.patch("openml.extensions.functions.get_extension_by_model")
+    def test_publish_treats_string_tags_as_single_tag(self, get_ext_mock):
+        flow_mock = mock.MagicMock()
+        flow_mock.tags = ["existing"]
+        get_ext_mock.return_value.model_to_flow.return_value = flow_mock
+
+        openml.publish(object(), tags="new_tag")
+
+        assert flow_mock.tags == ["existing", "new_tag"]