Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ target/
.idea
*.swp
.vscode
.cursorignore
.cursorindexingignore

# MYPY
.mypy_cache
Expand All @@ -96,4 +98,7 @@ dmypy.sock

# Tests
.pytest_cache
.venv
.venv

# Ruff
.ruff-cache/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ files: |
)/.*\.py$
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.3
rev: v0.14.10
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix, --no-cache]
Expand Down
6 changes: 3 additions & 3 deletions examples/Advanced/fetch_evaluations_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@

def plot_cdf(values, metric="predictive_accuracy"):
max_val = max(values)
n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
_, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
patches[0].set_xy(patches[0].get_xy()[:-1])
plt.xlim(max(0, min(values) - 0.1), 1)
plt.title("CDF")
Expand Down Expand Up @@ -116,7 +116,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
for i in range(len(flow_ids)):
flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
df = pd.concat([df, flow_values], ignore_index=True, axis=1)
fig, axs = plt.subplots()
_, axs = plt.subplots()
df.boxplot()
axs.set_title("Boxplot comparing " + metric + " for different flows")
axs.set_ylabel(metric)
Expand Down Expand Up @@ -178,4 +178,4 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
)

print(evals_setups.head(10))
print(evals_setups.head(10))
4 changes: 2 additions & 2 deletions examples/Basics/introduction_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# For certain functionality, such as uploading tasks or datasets, users have to
# sign up. Only accessing the data on OpenML does not require an account!
#
# If you don’t have an account yet, sign up now.
# If you don't have an account yet, sign up now.
# You will receive an API key, which will authenticate you to the server
# and allow you to download and upload datasets, tasks, runs and flows.
#
Expand Down Expand Up @@ -52,4 +52,4 @@
# %%
import openml

openml.config.set_root_cache_directory("YOURDIR")
openml.config.set_root_cache_directory("YOURDIR")
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
| In *Advances in Neural Information Processing Systems 28*, 2015
| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
""" # noqa F401
"""

# License: BSD 3-Clause

import pandas as pd

import openml

####################################################################################################
Expand Down Expand Up @@ -68,7 +66,7 @@

task_ids = []
for did in dataset_ids:
tasks_ = list(tasks.query("did == {}".format(did)).tid)
tasks_ = list(tasks.query(f"did == {did}").tid)
if len(tasks_) >= 1: # if there are multiple tasks, take the one with the lowest ID (oldest).
task_id = min(tasks_)
else:
Expand Down
7 changes: 3 additions & 4 deletions examples/_external_or_deprecated/2018_ida_strang_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
# License: BSD 3-Clause

import matplotlib.pyplot as plt

import openml
import pandas as pd

##############################################################################
# A basic step for each data-mining or machine learning task is to determine
Expand Down Expand Up @@ -86,10 +86,9 @@
def determine_class(val_lin, val_nonlin):
if val_lin < val_nonlin:
return class_values[0]
elif val_nonlin < val_lin:
if val_nonlin < val_lin:
return class_values[1]
else:
return class_values[2]
return class_values[2]


evaluations["class"] = evaluations.apply(
Expand Down
23 changes: 12 additions & 11 deletions examples/_external_or_deprecated/2018_kdd_rijn_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,24 @@

import sys

if sys.platform == "win32": # noqa
if sys.platform == "win32":
print(
"The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
)
exit()
sys.exit()

# DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
print("This example is deprecated, remove the `if False` in this code to use it manually.")
if False:
import json

import fanova
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import openml


##############################################################################
# With the advent of automated machine learning, automated hyperparameter
# optimization methods are by now routinely used in data mining. However, this
Expand Down Expand Up @@ -80,7 +80,7 @@
# important when it is put on a log-scale. All these simplifications can be
# addressed by defining a ConfigSpace. For a more elaborated example that uses
# this, please see:
# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401
# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py

suite = openml.study.get_suite("OpenML100")
flow_id = 7707
Expand All @@ -97,8 +97,7 @@
if limit_nr_tasks is not None and idx >= limit_nr_tasks:
continue
print(
"Starting with task %d (%d/%d)"
% (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
)
# note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
evals = openml.evaluations.list_evaluations_setups(
Expand All @@ -121,13 +120,13 @@
[
dict(
**{name: json.loads(value) for name, value in setup["parameters"].items()},
**{performance_column: setup[performance_column]}
**{performance_column: setup[performance_column]},
)
for _, setup in evals.iterrows()
]
)
except json.decoder.JSONDecodeError as e:
print("Task %d error: %s" % (task_id, e))
print(f"Task {task_id} error: {e}")
continue
# apply our filters, to keep only the setups that comply with the hyperparameters we want
for filter_key, filter_value in parameter_filters.items():
Expand Down Expand Up @@ -156,19 +155,21 @@
Y=setups_evals[performance_column].to_numpy(),
n_trees=n_trees,
)
for idx, pname in enumerate(parameter_names):
for idx, pname in enumerate(parameter_names): # noqa: PLW2901
try:
fanova_results.append(
{
"hyperparameter": pname.split(".")[-1],
"fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
"fanova": evaluator.quantify_importance([idx])[(idx,)][
"individual importance"
],
}
)
except RuntimeError as e:
# functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
# for all configurations (there is no variance). We will skip these tasks (like the authors did in the
# paper).
print("Task %d error: %s" % (task_id, e))
print(f"Task {task_id} error: {e}")
continue

# transform ``fanova_results`` from a list of dicts into a DataFrame
Expand Down
18 changes: 11 additions & 7 deletions examples/_external_or_deprecated/2018_neurips_perrone_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,25 @@

# License: BSD 3-Clause

import openml
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

import openml

flow_type = "svm" # this example will use the smaller svm flow evaluations
############################################################################
# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into
# a tabular format that can be used to build models.


def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"): # noqa: FBT002
"""
Fetch a list of evaluations based on the flows and tasks used in the experiments.

Expand Down Expand Up @@ -101,7 +102,10 @@ def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_cu


def create_table_from_evaluations(
eval_df, flow_type="svm", run_count=np.iinfo(np.int64).max, task_ids=None
eval_df,
flow_type="svm",
run_count=np.iinfo(np.int64).max, # noqa: B008
task_ids=None,
):
"""
Create tabular data with its ground truth from a dataframe of evaluations.
Expand Down Expand Up @@ -206,7 +210,7 @@ def list_categorical_attributes(flow_type="svm"):
model.fit(X, y)
y_pred = model.predict(X)

print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")


#############################################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def objective(trial: optuna.Trial) -> Pipeline:
run.publish()

logger.log(1, f"Run was uploaded to - {run.openml_url}")
except Exception as e:
except Exception as e: # noqa: BLE001
logger.log(1, f"Could not publish run - {e}")
else:
logger.log(
Expand Down
31 changes: 10 additions & 21 deletions examples/_external_or_deprecated/fetch_runtimes_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,16 @@
#
# * (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)

import openml
import numpy as np
from matplotlib import pyplot as plt
from joblib.parallel import parallel_backend

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

import openml

# %% [markdown]
# # Preparing tasks and scikit-learn models
Expand All @@ -63,12 +62,7 @@
# Viewing associated data
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
task_id,
n_repeats,
n_folds,
n_samples,
)
f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
)


Expand Down Expand Up @@ -101,7 +95,7 @@ def print_compare_runtimes(measures):
measures = run1.fold_evaluations

print("The timing and performance metrics available: ")
for key in measures.keys():
for key in measures:
print(key)
print()

Expand Down Expand Up @@ -206,7 +200,6 @@ def print_compare_runtimes(measures):
# included in the `wall_clock_time_millis_training` measure recorded.

# %%
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

Expand Down Expand Up @@ -284,22 +277,18 @@ def print_compare_runtimes(measures):

# %%


def extract_refit_time(run, repeat, fold):
refit_time = (
return (
run.fold_evaluations["wall_clock_time_millis"][repeat][fold]
- run.fold_evaluations["wall_clock_time_millis_training"][repeat][fold]
- run.fold_evaluations["wall_clock_time_millis_testing"][repeat][fold]
)
return refit_time


for repeat in range(n_repeats):
for fold in range(n_folds):
print(
"Repeat #{}-Fold #{}: {:.4f}".format(
repeat, fold, extract_refit_time(run4, repeat, fold)
)
)
print(f"Repeat #{repeat}-Fold #{fold}: {extract_refit_time(run4, repeat, fold):.4f}")

# %% [markdown]
# Along with the GridSearchCV already used above, we demonstrate how such
Expand Down
3 changes: 1 addition & 2 deletions examples/_external_or_deprecated/flow_id_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import openml


# %% [markdown]
# .. warning::
# .. include:: ../../test_server_usage_warning.txt
Expand Down Expand Up @@ -48,7 +47,7 @@
# %% [markdown]
# ## 2. Obtaining a flow given its name
# The schema of a flow is given in XSD (
# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)). # noqa E501
# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).
# Only two fields are required, a unique name, and an external version. While it should be pretty
# obvious why we need a name, the need for the additional external version information might not
# be immediately clear. However, this information is very important as it allows to have multiple
Expand Down
3 changes: 1 addition & 2 deletions examples/_external_or_deprecated/flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
# This tutorial covers how to train/run a model and how to upload the results.

# %%
import openml
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree

import openml

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
# # Plotting hyperparameter surfaces

# %%
import openml
import numpy as np

import openml

# %% [markdown]
# # First step - obtaining the data
# First, we need to choose an SVM flow, for example 8353, and a task. Finding the IDs of them are
Expand Down
Loading