From bb4554f441cb31fadb848c4e1645bdea6a0b99bf Mon Sep 17 00:00:00 2001
From: Omswastik-11
Date: Wed, 24 Dec 2025 00:18:32 +0530
Subject: [PATCH] improved the Getter API for users

---
 examples/Advanced/tasks_tutorial.py          |  28 +++--
 examples/Basics/simple_datasets_tutorial.py  |  12 +-
 .../Basics/simple_flows_and_runs_tutorial.py |  15 ++-
 examples/Basics/simple_tasks_tutorial.py     |   5 +-
 openml/__init__.py                           | 123 ++++++++++++++++++-
 tests/test_openml/test_openml.py             |  39 ++++++
 6 files changed, 206 insertions(+), 16 deletions(-)

diff --git a/examples/Advanced/tasks_tutorial.py b/examples/Advanced/tasks_tutorial.py
index dff7293ad..1418aa91c 100644
--- a/examples/Advanced/tasks_tutorial.py
+++ b/examples/Advanced/tasks_tutorial.py
@@ -24,13 +24,15 @@
 #
 # We will start by simply listing only *supervised classification* tasks.
 #
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
-# request a
+# **openml.list("task")** (or **openml.tasks.list_tasks()**) returns a dictionary of
+# dictionaries by default, but we request a
 # [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
 # instead to have better visualization capabilities and easier access:
 
 # %%
-tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
+tasks = openml.list("task", task_type=TaskType.SUPERVISED_CLASSIFICATION)
+# Legacy path still works:
+# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
 print(tasks.columns)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
@@ -66,7 +68,9 @@
 # Similar to listing tasks by task type, we can list tasks by tags:
 
 # %%
-tasks = openml.tasks.list_tasks(tag="OpenML100")
+tasks = openml.list("task", tag="OpenML100")
+# Legacy path still works:
+# tasks = openml.tasks.list_tasks(tag="OpenML100")
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
@@ -74,7 +78,9 @@
 # Furthermore, we can list tasks based on the dataset id:
 
 # %%
-tasks = openml.tasks.list_tasks(data_id=1471)
+tasks = openml.list("task", data_id=1471)
+# Legacy path still works:
+# tasks = openml.tasks.list_tasks(data_id=1471)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
@@ -82,7 +88,9 @@
 # In addition, a size limit and an offset can be applied both separately and simultaneously:
 
 # %%
-tasks = openml.tasks.list_tasks(size=10, offset=50)
+tasks = openml.list("task", size=10, offset=50)
+# Legacy path still works:
+# tasks = openml.tasks.list_tasks(size=10, offset=50)
 print(tasks)
 
 # %% [markdown]
@@ -98,7 +106,9 @@
 # Finally, it is also possible to list all tasks on OpenML with:
 
 # %%
-tasks = openml.tasks.list_tasks()
+tasks = openml.list("task")
+# Legacy path still works:
+# tasks = openml.tasks.list_tasks()
 print(len(tasks))
 
 # %% [markdown]
@@ -118,7 +128,9 @@
 
 # %%
 task_id = 31
-task = openml.tasks.get_task(task_id)
+task = openml.get("task", task_id)
+# Legacy path still works:
+# task = openml.tasks.get_task(task_id)
 
 # %%
 # Properties of the task are stored as member variables:
diff --git a/examples/Basics/simple_datasets_tutorial.py b/examples/Basics/simple_datasets_tutorial.py
index 75d36ed0f..6d90c22cb 100644
--- a/examples/Basics/simple_datasets_tutorial.py
+++ b/examples/Basics/simple_datasets_tutorial.py
@@ -14,15 +14,23 @@
 # ## List datasets stored on OpenML
 
 # %%
-datasets_df = openml.datasets.list_datasets()
+datasets_df = openml.list("dataset")
 print(datasets_df.head(n=10))
 
+# Legacy path still works:
+# datasets_df = openml.datasets.list_datasets()
+
 # %% [markdown]
 # ## Download a dataset
 
 # %%
 # Iris dataset https://www.openml.org/d/61
-dataset = openml.datasets.get_dataset(dataset_id=61)
+dataset = openml.get("dataset", 61)
+# You can also fetch by name:
+# dataset = openml.get("dataset", "Fashion-MNIST")
+
+# Legacy path still works:
+# dataset = openml.datasets.get_dataset(dataset_id=61)
 
 # Print a summary
 print(
diff --git a/examples/Basics/simple_flows_and_runs_tutorial.py b/examples/Basics/simple_flows_and_runs_tutorial.py
index 41eed9234..f99685f6d 100644
--- a/examples/Basics/simple_flows_and_runs_tutorial.py
+++ b/examples/Basics/simple_flows_and_runs_tutorial.py
@@ -24,12 +24,25 @@
 # %%
 openml.config.start_using_configuration_for_example()
 
+# %% [markdown]
+# ## Quick: list flows and runs via unified entrypoints
+
+# %%
+flows_df = openml.list("flow", size=3)
+print(flows_df.head())
+
+runs_df = openml.list("run", size=3)
+print(runs_df.head())
+
 # %% [markdown]
 # ## Train a machine learning model and evaluate it
 # NOTE: We are using task 119 from the test server: https://test.openml.org/d/20
 
 # %%
-task = openml.tasks.get_task(119)
+task = openml.get("task", 119)
+
+# Legacy path still works:
+# task = openml.tasks.get_task(119)
 
 # Get the data
 dataset = task.get_dataset()
diff --git a/examples/Basics/simple_tasks_tutorial.py b/examples/Basics/simple_tasks_tutorial.py
index 598ce4e71..0989d3e1d 100644
--- a/examples/Basics/simple_tasks_tutorial.py
+++ b/examples/Basics/simple_tasks_tutorial.py
@@ -10,7 +10,10 @@
 # [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):
 
 # %%
-task = openml.tasks.get_task(31)
+task = openml.get("task", 31)
+
+# Legacy path still works:
+# task = openml.tasks.get_task(31)
 
 # %% [markdown]
 # Get the dataset and its data from the task.
diff --git a/openml/__init__.py b/openml/__init__.py
index c49505eb9..81aa7b44a 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -18,6 +18,9 @@
 # License: BSD 3-Clause
 from __future__ import annotations
 
+import builtins
+from typing import Any, Callable, Dict
+
 from . import (
     _api_calls,
     config,
@@ -49,12 +52,122 @@
     OpenMLTask,
 )
 
+ListDispatcher = Dict[str, Callable[..., Any]]
+GetDispatcher = Dict[str, Callable[..., Any]]
+
+
+def list(object_type: str, /, **kwargs: Any) -> Any:  # noqa: A001
+    """List OpenML objects by type (e.g., datasets, tasks, flows, runs).
+
+    This is a convenience dispatcher that forwards to the existing type-specific
+    ``list_*`` functions. Existing imports remain available for backward compatibility.
+
+    Parameters
+    ----------
+    object_type : str
+        One of ``"dataset"``, ``"task"``, ``"flow"`` or ``"run"`` (case-insensitive).
+    **kwargs : Any
+        Forwarded unchanged to the selected ``list_*`` function.
+
+    Returns
+    -------
+    Any
+        Whatever the selected ``list_*`` function returns.
+
+    Raises
+    ------
+    ValueError
+        If ``object_type`` is not one of the supported types.
+    """
+    # Built per call so the ``list_*`` attributes are looked up late; this keeps
+    # ``mock.patch`` of the underlying functions effective.
+    dispatch: ListDispatcher = {
+        "dataset": datasets.functions.list_datasets,
+        "task": tasks.functions.list_tasks,
+        "flow": flows.functions.list_flows,
+        "run": runs.functions.list_runs,
+    }
+
+    try:
+        # ``str()`` mirrors ``get``: a non-string type raises ValueError, not AttributeError.
+        func = dispatch[str(object_type).lower()]
+    except KeyError as exc:
+        raise ValueError(
+            "Unsupported object_type for list; expected one of 'dataset', 'task', 'flow', 'run'.",
+        ) from exc
+
+    return func(**kwargs)
+
+
+def get(object_type_or_name: Any, identifier: Any | None = None, /, **kwargs: Any) -> Any:
+    """Get an OpenML object by type and identifier, or a dataset by name.
+
+    Parameters
+    ----------
+    object_type_or_name : Any
+        Either the object type (``"dataset"``, ``"task"``, ``"flow"`` or ``"run"``,
+        case-insensitive) or, when ``identifier`` is omitted, a dataset name.
+    identifier : Any, optional
+        Passed as the first positional argument to the selected ``get_*`` function.
+    **kwargs : Any
+        Forwarded unchanged to the selected ``get_*`` function.
+
+    Returns
+    -------
+    Any
+        Whatever the selected ``get_*`` function returns.
+
+    Raises
+    ------
+    ValueError
+        If the object type is unsupported, or if ``identifier`` is omitted and the
+        first argument is not a string or is a bare object type such as ``"task"``.
+
+    Examples
+    --------
+    openml.get("dataset", 61)
+    openml.get("dataset", "Fashion-MNIST")
+    openml.get("task", 31)
+    openml.get("flow", 10)
+    openml.get("run", 20)
+    openml.get("Fashion-MNIST")  # dataset lookup by name (no type specified)
+    """
+    # Built per call so the ``get_*`` attributes are looked up late; this keeps
+    # ``mock.patch`` of the underlying functions effective.
+    dispatch: GetDispatcher = {
+        "dataset": datasets.functions.get_dataset,
+        "task": tasks.functions.get_task,
+        "flow": flows.functions.get_flow,
+        "run": runs.functions.get_run,
+    }
+
+    # Single-argument shortcut: treat string without type as dataset lookup.
+    if identifier is None:
+        if not isinstance(object_type_or_name, str):
+            raise ValueError("Please provide an object_type when identifier is not provided.")
+        if object_type_or_name.lower() in dispatch:
+            # ``openml.get("task")`` is a forgotten identifier, not a dataset name; a dataset
+            # really named like a type is still reachable via ``openml.get("dataset", name)``.
+            raise ValueError(
+                f"Missing identifier for object_type {object_type_or_name!r}.",
+            )
+        return datasets.functions.get_dataset(object_type_or_name, **kwargs)
+
+    try:
+        func = dispatch[str(object_type_or_name).lower()]
+    except KeyError as exc:
+        raise ValueError(
+            "Unsupported object_type for get; expected one of 'dataset', 'task', 'flow', 'run'.",
+        ) from exc
+
+    return func(identifier, **kwargs)
+
 
 def populate_cache(
-    task_ids: list[int] | None = None,
-    dataset_ids: list[int | str] | None = None,
-    flow_ids: list[int] | None = None,
-    run_ids: list[int] | None = None,
+    task_ids: builtins.list[int] | None = None,
+    dataset_ids: builtins.list[int | str] | None = None,
+    flow_ids: builtins.list[int] | None = None,
+    run_ids: builtins.list[int] | None = None,
 ) -> None:
     """
     Populate a cache for offline and parallel usage of the OpenML connector.
@@ -91,6 +204,8 @@ def populate_cache(
 
 
 __all__ = [
+    "list",
+    "get",
     "OpenMLDataset",
     "OpenMLDataFeature",
     "OpenMLRun",
diff --git a/tests/test_openml/test_openml.py b/tests/test_openml/test_openml.py
index 998046726..0cd3b8211 100644
--- a/tests/test_openml/test_openml.py
+++ b/tests/test_openml/test_openml.py
@@ -41,3 +41,42 @@ def test_populate_cache(
         assert task_mock.call_count == 2
         for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]):
             assert argument[0] == fixture
+
+    @mock.patch("openml.tasks.functions.list_tasks")
+    @mock.patch("openml.datasets.functions.list_datasets")
+    def test_list_dispatch(self, list_datasets_mock, list_tasks_mock):
+        openml.list("dataset", output_format="dataframe")
+        list_datasets_mock.assert_called_once_with(output_format="dataframe")
+
+        openml.list("task", size=5)
+        list_tasks_mock.assert_called_once_with(size=5)
+
+    @mock.patch("openml.tasks.functions.get_task")
+    @mock.patch("openml.datasets.functions.get_dataset")
+    def test_get_dispatch(self, get_dataset_mock, get_task_mock):
+        openml.get("dataset", 61)
+        get_dataset_mock.assert_called_with(61)
+
+        openml.get("dataset", "Fashion-MNIST", version=2)
+        get_dataset_mock.assert_called_with("Fashion-MNIST", version=2)
+
+        openml.get("Fashion-MNIST")
+        get_dataset_mock.assert_called_with("Fashion-MNIST")
+
+        openml.get("task", 31)
+        get_task_mock.assert_called_with(31)
+
+    def test_dispatch_rejects_invalid_arguments(self):
+        invalid_calls = [
+            lambda: openml.list("unknown"),
+            lambda: openml.list(42),
+            lambda: openml.get("unknown", 1),
+            lambda: openml.get("task"),
+            lambda: openml.get(61),
+        ]
+        for call in invalid_calls:
+            try:
+                call()
+            except ValueError:
+                continue
+            raise AssertionError("expected ValueError for invalid dispatch arguments")