28 changes: 20 additions & 8 deletions examples/Advanced/tasks_tutorial.py
@@ -24,13 +24,15 @@
#
# We will start by simply listing only *supervised classification* tasks.
#
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
-# request a
+# **openml.list("task")** (or **openml.tasks.list_tasks()**) returns a dictionary of
+# dictionaries by default, but we request a
# [pandas dataframe](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
# instead to have better visualization capabilities and easier access:

# %%
-tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
+tasks = openml.list("task", task_type=TaskType.SUPERVISED_CLASSIFICATION)
# Legacy path still works:
# tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
@@ -66,23 +68,29 @@
# Similar to listing tasks by task type, we can list tasks by tags:

# %%
-tasks = openml.tasks.list_tasks(tag="OpenML100")
+tasks = openml.list("task", tag="OpenML100")
# Legacy path still works:
# tasks = openml.tasks.list_tasks(tag="OpenML100")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# %% [markdown]
# Furthermore, we can list tasks based on the dataset id:

# %%
-tasks = openml.tasks.list_tasks(data_id=1471)
+tasks = openml.list("task", data_id=1471)
# Legacy path still works:
# tasks = openml.tasks.list_tasks(data_id=1471)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

# %% [markdown]
# In addition, a size limit and an offset can be applied both separately and simultaneously:

# %%
-tasks = openml.tasks.list_tasks(size=10, offset=50)
+tasks = openml.list("task", size=10, offset=50)
# Legacy path still works:
# tasks = openml.tasks.list_tasks(size=10, offset=50)
print(tasks)

# %% [markdown]
@@ -98,7 +106,9 @@
# Finally, it is also possible to list all tasks on OpenML with:

# %%
-tasks = openml.tasks.list_tasks()
+tasks = openml.list("task")
# Legacy path still works:
# tasks = openml.tasks.list_tasks()
print(len(tasks))

# %% [markdown]
@@ -118,7 +128,9 @@

# %%
task_id = 31
-task = openml.tasks.get_task(task_id)
+task = openml.get("task", task_id)
# Legacy path still works:
# task = openml.tasks.get_task(task_id)

# %%
# Properties of the task are stored as member variables:
12 changes: 10 additions & 2 deletions examples/Basics/simple_datasets_tutorial.py
@@ -14,15 +14,23 @@
# ## List datasets stored on OpenML

# %%
-datasets_df = openml.datasets.list_datasets()
+datasets_df = openml.list("dataset")
print(datasets_df.head(n=10))

# Legacy path still works:
# datasets_df = openml.datasets.list_datasets()

# %% [markdown]
# ## Download a dataset

# %%
# Iris dataset https://www.openml.org/d/61
-dataset = openml.datasets.get_dataset(dataset_id=61)
+dataset = openml.get("dataset", 61)
# You can also fetch by name:
# dataset = openml.get("dataset", "Fashion-MNIST")

# Legacy path still works:
# dataset = openml.datasets.get_dataset(dataset_id=61)

# Print a summary
print(
15 changes: 14 additions & 1 deletion examples/Basics/simple_flows_and_runs_tutorial.py
@@ -24,12 +24,25 @@
# %%
openml.config.start_using_configuration_for_example()

# %% [markdown]
# ## Quick: list flows and runs via unified entrypoints

# %%
flows_df = openml.list("flow", size=3)
print(flows_df.head())

runs_df = openml.list("run", size=3)
print(runs_df.head())

# %% [markdown]
# ## Train a machine learning model and evaluate it
# NOTE: We are using task 119 from the test server: https://test.openml.org/d/20

# %%
-task = openml.tasks.get_task(119)
+task = openml.get("task", 119)

# Legacy path still works:
# task = openml.tasks.get_task(119)

# Get the data
dataset = task.get_dataset()
5 changes: 4 additions & 1 deletion examples/Basics/simple_tasks_tutorial.py
@@ -10,7 +10,10 @@
# [supervised classification on credit-g](https://www.openml.org/search?type=task&id=31&source_data.data_id=31):

# %%
-task = openml.tasks.get_task(31)
+task = openml.get("task", 31)

# Legacy path still works:
# task = openml.tasks.get_task(31)

# %% [markdown]
# Get the dataset and its data from the task.
75 changes: 71 additions & 4 deletions openml/__init__.py
@@ -18,6 +18,9 @@
# License: BSD 3-Clause
from __future__ import annotations

import builtins
from typing import Any, Callable, Dict

from . import (
_api_calls,
config,
@@ -49,12 +52,74 @@
OpenMLTask,
)

ListDispatcher = Dict[str, Callable[..., Any]]
GetDispatcher = Dict[str, Callable[..., Any]]


def list(object_type: str, /, **kwargs: Any) -> Any: # noqa: A001
Collaborator: list is not a good name, as it shadows the Python builtin list - we should avoid that!

Collaborator: what other good names are there?

Collaborator: maybe list_all

"""List OpenML objects by type (e.g., datasets, tasks, flows, runs).

This is a convenience dispatcher that forwards to the existing type-specific
``list_*`` functions. Existing imports remain available for backward compatibility.
"""
dispatch: ListDispatcher = {
"dataset": datasets.functions.list_datasets,
"task": tasks.functions.list_tasks,
"flow": flows.functions.list_flows,
"run": runs.functions.list_runs,
}

try:
Collaborator: you should really stop abusing try/except for case distinctions. This is not good style, since you cannot distinguish actual exceptions raised inside the try block from the intended KeyError.

Instead, use if/else with a precise condition. In this case, you can also:

  • use dict.get, and then check whether None was returned.
  • do an input check on object_type (one way to do this is sketched after the function below).

func = dispatch[object_type.lower()]
except KeyError as exc: # pragma: no cover - defensive branch
raise ValueError(
"Unsupported object_type for list; expected one of 'dataset', 'task', 'flow', 'run'.",
) from exc

return func(**kwargs)
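
Below is a minimal sketch of the two suggestions above (a list_all name instead of list, and a dict.get lookup instead of try/except). It assumes it would live in openml/__init__.py, where the datasets, tasks, flows and runs submodules are already imported; the name and error message are illustrative, not part of the PR. The same pattern would apply to the get dispatcher further down.

from typing import Any, Callable, Dict


def list_all(object_type: str, /, **kwargs: Any) -> Any:
    """List OpenML objects by type ('dataset', 'task', 'flow', 'run')."""
    dispatch: Dict[str, Callable[..., Any]] = {
        "dataset": datasets.functions.list_datasets,
        "task": tasks.functions.list_tasks,
        "flow": flows.functions.list_flows,
        "run": runs.functions.list_runs,
    }
    # Explicit lookup instead of try/except: an unknown type is handled here,
    # while exceptions raised inside the listing functions propagate untouched.
    func = dispatch.get(object_type.lower())
    if func is None:
        raise ValueError(
            f"Unsupported object_type {object_type!r}; "
            "expected one of 'dataset', 'task', 'flow', 'run'.",
        )
    return func(**kwargs)
# --- end of sketch ---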


def get(object_type_or_name: Any, identifier: Any | None = None, /, **kwargs: Any) -> Any:
Collaborator: the first arg can be two different things, I would avoid that - instead, I would do one of two things:

  • use *, the keyword-only argument syntax
  • make the identifier first, and object_type second (both options are sketched after this function)

"""Get an OpenML object by type and identifier, or a dataset by name.

Examples
--------
openml.get("dataset", 61)
openml.get("dataset", "Fashion-MNIST")
openml.get("task", 31)
openml.get("flow", 10)
openml.get("run", 20)
openml.get("Fashion-MNIST") # dataset lookup by name (no type specified)
"""
# Single-argument shortcut: treat string without type as dataset lookup.
if identifier is None:
if isinstance(object_type_or_name, str):
return datasets.functions.get_dataset(object_type_or_name, **kwargs)
raise ValueError("Please provide an object_type when identifier is not provided.")

object_type = str(object_type_or_name).lower()
dispatch: GetDispatcher = {
"dataset": datasets.functions.get_dataset,
"task": tasks.functions.get_task,
"flow": flows.functions.get_flow,
"run": runs.functions.get_run,
}

try:
Collaborator: again a try/except that you should avoid.

func = dispatch[object_type]
except KeyError as exc: # pragma: no cover - defensive branch
raise ValueError(
"Unsupported object_type for get; expected one of 'dataset', 'task', 'flow', 'run'.",
) from exc

return func(identifier, **kwargs)
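
Below is a minimal sketch of the two signatures proposed above. It is one possible reading of the review comment; the names, defaults and types are hypothetical and not part of the PR.

from typing import Any


# Option 1: `*` makes the identifier keyword-only, so the only positional
# argument is the object type and the two meanings can no longer be mixed up.
def get(object_type: str, /, *, identifier: int | str | None = None, **kwargs: Any) -> Any:
    ...


# Option 2: identifier first, object type second (defaulting to "dataset" here),
# so the first positional argument always means the identifier. Named get_by_id
# in this sketch only to show both variants side by side.
def get_by_id(identifier: int | str, object_type: str = "dataset", /, **kwargs: Any) -> Any:
    ...
# --- end of sketch ---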


def populate_cache(
-task_ids: list[int] | None = None,
-dataset_ids: list[int | str] | None = None,
-flow_ids: list[int] | None = None,
-run_ids: list[int] | None = None,
+task_ids: builtins.list[int] | None = None,
Collaborator: why are you changing this?

+dataset_ids: builtins.list[int | str] | None = None,
+flow_ids: builtins.list[int] | None = None,
+run_ids: builtins.list[int] | None = None,
) -> None:
"""
Populate a cache for offline and parallel usage of the OpenML connector.
@@ -91,6 +156,8 @@ def populate_cache(


__all__ = [
"list",
"get",
"OpenMLDataset",
"OpenMLDataFeature",
"OpenMLRun",
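
On the builtins.list question above: once this module defines a top-level function named list, the bare name list inside openml/__init__.py refers to that function, so the annotations spell out builtins.list to keep their original meaning. A self-contained sketch of the effect (illustrative only, not from the PR discussion):

from __future__ import annotations

import builtins


def list(object_type: str) -> None:  # noqa: A001 - stands in for the new dispatcher
    """Placeholder for the module-level dispatcher added in this PR."""


# In this module, ``list[int]`` would now subscript the function above, which
# type checkers reject; ``builtins.list[int]`` still means "a list of ints".
def populate(ids: builtins.list[int] | None = None) -> None:
    """Placeholder for populate_cache."""
# --- end of sketch ---
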
24 changes: 24 additions & 0 deletions tests/test_openml/test_openml.py
@@ -41,3 +41,27 @@ def test_populate_cache(
assert task_mock.call_count == 2
for argument, fixture in zip(task_mock.call_args_list, [(1,), (2,)]):
assert argument[0] == fixture

@mock.patch("openml.tasks.functions.list_tasks")
@mock.patch("openml.datasets.functions.list_datasets")
def test_list_dispatch(self, list_datasets_mock, list_tasks_mock):
openml.list("dataset", output_format="dataframe")
list_datasets_mock.assert_called_once_with(output_format="dataframe")

openml.list("task", size=5)
list_tasks_mock.assert_called_once_with(size=5)

@mock.patch("openml.tasks.functions.get_task")
@mock.patch("openml.datasets.functions.get_dataset")
def test_get_dispatch(self, get_dataset_mock, get_task_mock):
openml.get("dataset", 61)
get_dataset_mock.assert_called_with(61)

openml.get("dataset", "Fashion-MNIST", version=2)
get_dataset_mock.assert_called_with("Fashion-MNIST", version=2)

openml.get("Fashion-MNIST")
get_dataset_mock.assert_called_with("Fashion-MNIST")

openml.get("task", 31)
get_task_mock.assert_called_with(31)
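
A possible companion test for the error branch of both dispatchers. This is a self-contained sketch with hypothetical class and method names; it is not part of this PR and assumes only that unknown types raise ValueError before any API call.

import unittest

import openml


class TestUnifiedDispatchErrors(unittest.TestCase):
    """Hypothetical extra tests: unknown object types should fail fast, offline."""

    def test_unknown_object_type_raises(self):
        # Neither dispatcher should reach the server for an unsupported type;
        # both are expected to raise ValueError before any API call is made.
        with self.assertRaises(ValueError):
            openml.list("benchmark")
        with self.assertRaises(ValueError):
            openml.get("benchmark", 1)
# --- end of sketch ---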