From 0f216401c32a5a8a1091373122426259d206c0c0 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Wed, 24 Dec 2025 15:55:36 +0530 Subject: [PATCH 1/4] improve publish api for users --- .../Basics/simple_flows_and_runs_tutorial.py | 8 +++ openml/__init__.py | 35 +++++++++++++ tests/test_openml/test_openml.py | 51 +++++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py index 41eed9234..f5c165214 100644 --- a/examples/Basics/simple_flows_and_runs_tutorial.py +++ b/examples/Basics/simple_flows_and_runs_tutorial.py @@ -48,6 +48,13 @@ clf = KNeighborsClassifier(**knn_parameters) clf.fit(X_train, y_train) +# Option A: auto-publish the estimator via unified helper (requires openml-sklearn extension). +try: + flow_id = openml.publish(clf) + print(f"Auto-published flow id: {flow_id}") +except Exception as ex: # pragma: no cover - example path + print(f"Auto-publish failed (is openml-sklearn installed?): {ex}") + # Get experiment results y_pred = clf.predict(X_test) y_pred_proba = clf.predict_proba(X_test) @@ -57,6 +64,7 @@ # First, create a fow and fill it with metadata about the machine learning model. # %% +# Option B: manually build the flow knn_flow = openml.flows.OpenMLFlow( # Metadata model=clf, # or None, if you do not want to upload the model object. diff --git a/openml/__init__.py b/openml/__init__.py index c49505eb9..d691bd22b 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,8 @@ # License: BSD 3-Clause from __future__ import annotations +from typing import Any, Sequence + from . import ( _api_calls, config, @@ -33,6 +35,7 @@ utils, ) from .__version__ import __version__ +from .base import OpenMLBase from .datasets import OpenMLDataFeature, OpenMLDataset from .evaluations import OpenMLEvaluation from .flows import OpenMLFlow @@ -50,6 +53,37 @@ ) +def publish(obj: Any, *, name: str | None = None, tags: Sequence[str] | None = None) -> Any: + """Publish a common object (flow/model/run/dataset) with minimal friction. + + If ``obj`` is already an OpenML object (``OpenMLBase``) it will call its ``publish`` method. + Otherwise it looks for a registered extension (e.g., scikit-learn) to convert the object + into an ``OpenMLFlow`` and publish it. + """ + if isinstance(obj, OpenMLBase): + if tags is not None and hasattr(obj, "tags"): + existing = list(getattr(obj, "tags", []) or []) + merged = list(dict.fromkeys([*existing, *tags])) + obj.tags = merged + if name is not None and hasattr(obj, "name"): + obj.name = name + return obj.publish() + + extension = extensions.functions.get_extension_by_model(obj, raise_if_no_extension=True) + if extension is None: # defensive; should not happen with raise_if_no_extension=True + raise ValueError("No extension registered to handle the provided object.") + flow = extension.model_to_flow(obj) + + if name is not None: + flow.name = name + + if tags is not None: + existing_tags = list(getattr(flow, "tags", []) or []) + flow.tags = list(dict.fromkeys([*existing_tags, *tags])) + + return flow.publish() + + def populate_cache( task_ids: list[int] | None = None, dataset_ids: list[int | str] | None = None, @@ -91,6 +125,7 @@ def populate_cache( __all__ = [ + "publish", "OpenMLDataset", "OpenMLDataFeature", "OpenMLRun", diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py index 998046726..28e6c4e1c 100644 --- a/tests/test_openml/test_openml.py +++ b/tests/test_openml/test_openml.py @@ -41,3 +41,54 @@ def test_populate_cache( assert task_mock.call_count == 2 for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]): assert argument[0] == fixture + + def test_publish_with_openml_object_merges_tags_and_name(self): + class Dummy(openml.base.OpenMLBase): + def __init__(self) -> None: + self.tags = ["a"] + self.name = "orig" + self.published = False + + @property + def id(self): + return None + + def _get_repr_body_fields(self): + return [] + + def _to_dict(self): + return {} + + def _parse_publish_response(self, xml_response): + return None + + def publish(self): + self.published = True + return self + + obj = Dummy() + result = openml.publish(obj, name="new", tags=["b", "a"]) + assert result is obj + assert obj.published is True + assert obj.name == "new" + assert obj.tags == ["a", "b"] # dedup and preserve order from original + + @mock.patch("openml.extensions.functions.get_extension_by_model") + def test_publish_with_extension(self, get_ext_mock): + flow_mock = mock.MagicMock() + flow_mock.tags = [] + flow_mock.publish.return_value = "flow-id" + + ext_instance = mock.MagicMock() + ext_instance.model_to_flow.return_value = flow_mock + get_ext_mock.return_value = ext_instance + + model = object() + flow_id = openml.publish(model, name="n", tags=["x"]) + + get_ext_mock.assert_called_once_with(model, raise_if_no_extension=True) + ext_instance.model_to_flow.assert_called_once_with(model) + assert flow_mock.name == "n" + assert flow_mock.tags == ["x"] + flow_mock.publish.assert_called_once_with() + assert flow_id == "flow-id" From 3b1d9616981291d0c0fc9f6896e8996e98fc73fe Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 25 Dec 2025 13:20:24 +0530 Subject: [PATCH 2/4] improve doc-string --- .../Basics/simple_flows_and_runs_tutorial.py | 62 +++++---- openml/__init__.py | 128 +++++++++++++++--- 2 files changed, 140 insertions(+), 50 deletions(-) diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py index f5c165214..05aea7a02 100644 --- a/examples/Basics/simple_flows_and_runs_tutorial.py +++ b/examples/Basics/simple_flows_and_runs_tutorial.py @@ -2,7 +2,6 @@ # A simple tutorial on how to upload results from a machine learning experiment to OpenML. # %% -import sklearn from sklearn.neighbors import KNeighborsClassifier import openml @@ -48,42 +47,47 @@ clf = KNeighborsClassifier(**knn_parameters) clf.fit(X_train, y_train) -# Option A: auto-publish the estimator via unified helper (requires openml-sklearn extension). -try: - flow_id = openml.publish(clf) - print(f"Auto-published flow id: {flow_id}") -except Exception as ex: # pragma: no cover - example path - print(f"Auto-publish failed (is openml-sklearn installed?): {ex}") - # Get experiment results y_pred = clf.predict(X_test) y_pred_proba = clf.predict_proba(X_test) # %% [markdown] # ## Upload the machine learning experiments to OpenML -# First, create a fow and fill it with metadata about the machine learning model. +# +# ### Option A: Automatic publishing (simplified) +# The publish function automatically detects the model type and creates the flow: # %% -# Option B: manually build the flow -knn_flow = openml.flows.OpenMLFlow( - # Metadata - model=clf, # or None, if you do not want to upload the model object. - name="CustomKNeighborsClassifier", - description="A custom KNeighborsClassifier flow for OpenML.", - external_version=f"{sklearn.__version__}", - language="English", - tags=["openml_tutorial_knn"], - dependencies=f"{sklearn.__version__}", - # Hyperparameters - parameters={k: str(v) for k, v in knn_parameters.items()}, - parameters_meta_info={ - "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"} - }, - # If you have a pipeline with subcomponents, such as preprocessing, add them here. - components={}, -) -knn_flow.publish() -print(f"knn_flow was published with the ID {knn_flow.flow_id}") +knn_flow = openml.publish(clf, tags=["openml_tutorial_knn"]) +print(f"Flow was auto-published with ID {knn_flow.flow_id}") + +# %% [markdown] +# ### Option B: Manual flow construction (full control) +# For advanced use cases, you can manually construct the flow: + +# %% +# Uncomment to use manual flow construction: +# knn_flow_manual = openml.flows.OpenMLFlow( +# name="sklearn.neighbors.classification.KNeighborsClassifier(my_name)", +# class_name="sklearn.neighbors.classification.KNeighborsClassifier", +# description="KNeighborsClassifier(algorithm='brute', leaf_size=30, \n" +# "metric='minkowski', metric_params=None, n_jobs=-1, \n" +# "n_neighbors=5, p=2, weights='uniform')", +# model=clf, +# components=OrderedDict(), +# parameters=OrderedDict(), +# parameters_meta_dict=OrderedDict(), +# external_version="0.20.0", +# tags=["openml_tutorial_knn"], +# language="English", +# dependencies="sklearn==0.20.0\nnumpy>=1.6.1\nscipy>=0.9", +# ) +# knn_flow_manual.extension = extension +# knn_flow_manual = knn_flow_manual.publish(raise_error_if_exists=True) +# print(f"Manual flow URL: {knn_flow_manual.openml_url}") + +# %% [markdown] +# Now we'll use the auto-published flow to create and upload a run. # %% [markdown] # Second, we create a run to store the results associated with the flow. diff --git a/openml/__init__.py b/openml/__init__.py index d691bd22b..4d1b0bcd5 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,7 @@ # License: BSD 3-Clause from __future__ import annotations +import contextlib from typing import Any, Sequence from . import ( @@ -56,10 +57,72 @@ def publish(obj: Any, *, name: str | None = None, tags: Sequence[str] | None = None) -> Any: """Publish a common object (flow/model/run/dataset) with minimal friction. - If ``obj`` is already an OpenML object (``OpenMLBase``) it will call its ``publish`` method. - Otherwise it looks for a registered extension (e.g., scikit-learn) to convert the object - into an ``OpenMLFlow`` and publish it. + This function provides a unified entry point for publishing various OpenML objects. + It automatically detects the object type and routes to the appropriate publishing + mechanism: + + - For OpenML objects (``OpenMLDataset``, ``OpenMLFlow``, ``OpenMLRun``, etc.), + it directly calls their ``publish()`` method. + - For external models (e.g., scikit-learn estimators), it uses registered + extensions to convert them to ``OpenMLFlow`` objects before publishing. + + Parameters + ---------- + obj : Any + The object to publish. Can be: + - An OpenML object (OpenMLDataset, OpenMLFlow, OpenMLRun, OpenMLTask) + - A machine learning model from a supported framework (e.g., scikit-learn) + name : str, optional + Override the default name for the published object. + If not provided, uses the object's default naming convention. + tags : Sequence[str], optional + Additional tags to attach to the published object. + Will be merged with any existing tags, removing duplicates while + preserving order. + + Returns + ------- + Any + The published object (typically with updated ID and metadata). + + Raises + ------ + ValueError + If no extension is registered to handle the provided model type. + + Examples + -------- + Publishing an OpenML dataset: + + >>> dataset = openml.datasets.get_dataset(61) + >>> openml.publish(dataset, tags=["example"]) + + Publishing a scikit-learn model: + + >>> from sklearn.tree import DecisionTreeClassifier + >>> clf = DecisionTreeClassifier(max_depth=5) + >>> openml.publish(clf, name="MyDecisionTree", tags=["tutorial"]) + + Publishing an OpenML flow directly: + + >>> flow = openml.flows.OpenMLFlow(...) + >>> openml.publish(flow) + + Publishing an OpenML run (after execution with predictions): + + >>> run = openml.runs.OpenMLRun( + ... task_id=1, flow_id=100, dataset_id=61, + ... data_content=predictions # predictions from model evaluation + ... ) + >>> openml.publish(run, tags=["experiment"]) + + Notes + ----- + For external models (e.g., scikit-learn), the corresponding extension must be + installed (e.g., ``openml-sklearn``). The extension will be automatically imported + if available. """ + # Case 1: Object is already an OpenML entity if isinstance(obj, OpenMLBase): if tags is not None and hasattr(obj, "tags"): existing = list(getattr(obj, "tags", []) or []) @@ -69,8 +132,12 @@ def publish(obj: Any, *, name: str | None = None, tags: Sequence[str] | None = N obj.name = name return obj.publish() + # Case 2: Object is an external model - use extension registry + # Attempt to auto-import common extensions + _ensure_extension_imported(obj) + extension = extensions.functions.get_extension_by_model(obj, raise_if_no_extension=True) - if extension is None: # defensive; should not happen with raise_if_no_extension=True + if extension is None: # Defensive check (should not occur with raise_if_no_extension=True) raise ValueError("No extension registered to handle the provided object.") flow = extension.model_to_flow(obj) @@ -84,6 +151,25 @@ def publish(obj: Any, *, name: str | None = None, tags: Sequence[str] | None = N return flow.publish() +def _ensure_extension_imported(obj: Any) -> None: + """Attempt to import the appropriate extension for common frameworks. + + This is a convenience helper to automatically import extensions for + well-known frameworks, reducing friction for users. + + Parameters + ---------- + obj : Any + The object to check. + """ + obj_module = type(obj).__module__ + + # Check for scikit-learn models + if obj_module.startswith("sklearn"): + with contextlib.suppress(ImportError): + import openml_sklearn # noqa: F401 + + def populate_cache( task_ids: list[int] | None = None, dataset_ids: list[int | str] | None = None, @@ -125,34 +211,34 @@ def populate_cache( __all__ = [ - "publish", - "OpenMLDataset", + "OpenMLBenchmarkSuite", + "OpenMLClassificationTask", + "OpenMLClusteringTask", "OpenMLDataFeature", - "OpenMLRun", - "OpenMLSplit", + "OpenMLDataset", "OpenMLEvaluation", - "OpenMLSetup", - "OpenMLParameter", - "OpenMLTask", - "OpenMLSupervisedTask", - "OpenMLClusteringTask", + "OpenMLFlow", "OpenMLLearningCurveTask", + "OpenMLParameter", "OpenMLRegressionTask", - "OpenMLClassificationTask", - "OpenMLFlow", + "OpenMLRun", + "OpenMLSetup", + "OpenMLSplit", "OpenMLStudy", - "OpenMLBenchmarkSuite", + "OpenMLSupervisedTask", + "OpenMLTask", + "__version__", + "_api_calls", + "config", "datasets", "evaluations", "exceptions", "extensions", - "config", - "runs", "flows", - "tasks", + "publish", + "runs", "setups", "study", + "tasks", "utils", - "_api_calls", - "__version__", ] From 3dfe34a6802a9965f8c9e3a1eb86759893349984 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 25 Dec 2025 13:29:23 +0530 Subject: [PATCH 3/4] update __init__.py --- openml/__init__.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index 4d1b0bcd5..0ff233394 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -211,34 +211,34 @@ def populate_cache( __all__ = [ - "OpenMLBenchmarkSuite", - "OpenMLClassificationTask", - "OpenMLClusteringTask", - "OpenMLDataFeature", "OpenMLDataset", + "OpenMLDataFeature", + "OpenMLRun", + "OpenMLSplit", "OpenMLEvaluation", - "OpenMLFlow", - "OpenMLLearningCurveTask", + "OpenMLSetup", "OpenMLParameter", + "OpenMLTask", + "OpenMLSupervisedTask", + "OpenMLClusteringTask", + "OpenMLLearningCurveTask", "OpenMLRegressionTask", - "OpenMLRun", - "OpenMLSetup", - "OpenMLSplit", + "OpenMLClassificationTask", + "OpenMLFlow", "OpenMLStudy", - "OpenMLSupervisedTask", - "OpenMLTask", - "__version__", - "_api_calls", - "config", + "OpenMLBenchmarkSuite", "datasets", "evaluations", "exceptions", "extensions", - "flows", - "publish", + "config", "runs", + "flows", + "tasks", "setups", "study", - "tasks", "utils", + "_api_calls", + "__version__", + "publish", ] From db367783ec28085f6277c13b9a8ab287ff9d3438 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 25 Dec 2025 13:34:46 +0530 Subject: [PATCH 4/4] update examples --- .../Basics/simple_flows_and_runs_tutorial.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py index 05aea7a02..050f3353a 100644 --- a/examples/Basics/simple_flows_and_runs_tutorial.py +++ b/examples/Basics/simple_flows_and_runs_tutorial.py @@ -3,7 +3,7 @@ # %% from sklearn.neighbors import KNeighborsClassifier - +import sklearn import openml # %% [markdown] @@ -66,25 +66,25 @@ # For advanced use cases, you can manually construct the flow: # %% -# Uncomment to use manual flow construction: -# knn_flow_manual = openml.flows.OpenMLFlow( -# name="sklearn.neighbors.classification.KNeighborsClassifier(my_name)", -# class_name="sklearn.neighbors.classification.KNeighborsClassifier", -# description="KNeighborsClassifier(algorithm='brute', leaf_size=30, \n" -# "metric='minkowski', metric_params=None, n_jobs=-1, \n" -# "n_neighbors=5, p=2, weights='uniform')", -# model=clf, -# components=OrderedDict(), -# parameters=OrderedDict(), -# parameters_meta_dict=OrderedDict(), -# external_version="0.20.0", -# tags=["openml_tutorial_knn"], -# language="English", -# dependencies="sklearn==0.20.0\nnumpy>=1.6.1\nscipy>=0.9", -# ) -# knn_flow_manual.extension = extension -# knn_flow_manual = knn_flow_manual.publish(raise_error_if_exists=True) -# print(f"Manual flow URL: {knn_flow_manual.openml_url}") +knn_flow = openml.flows.OpenMLFlow( + # Metadata + model=clf, # or None, if you do not want to upload the model object. + name="CustomKNeighborsClassifier", + description="A custom KNeighborsClassifier flow for OpenML.", + external_version=f"{sklearn.__version__}", + language="English", + tags=["openml_tutorial_knn"], + dependencies=f"{sklearn.__version__}", + # Hyperparameters + parameters={k: str(v) for k, v in knn_parameters.items()}, + parameters_meta_info={ + "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"} + }, + # If you have a pipeline with subcomponents, such as preprocessing, add them here. + components={}, +) +knn_flow.publish() +print(f"knn_flow was published with the ID {knn_flow.flow_id}") # %% [markdown] # Now we'll use the auto-published flow to create and upload a run.