diff --git a/.gitignore b/.gitignore
index 132070bf3..92679e5ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,6 +88,8 @@ target/
 .idea
 *.swp
 .vscode
+.cursorignore
+.cursorindexingignore
 
 # MYPY
 .mypy_cache
@@ -96,4 +98,7 @@ dmypy.sock
 
 # Tests
 .pytest_cache
-.venv
\ No newline at end of file
+.venv
+
+# Ruff
+.ruff_cache/
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 95e2a5239..0987bad90 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ files: |
   )/.*\.py$
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.7.3
+  rev: v0.14.10
   hooks:
   - id: ruff
     args: [--fix, --exit-non-zero-on-fix, --no-cache]
diff --git a/examples/Advanced/fetch_evaluations_tutorial.py b/examples/Advanced/fetch_evaluations_tutorial.py
index 1b759423b..97b8d1bef 100644
--- a/examples/Advanced/fetch_evaluations_tutorial.py
+++ b/examples/Advanced/fetch_evaluations_tutorial.py
@@ -75,7 +75,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
 
 
     max_val = max(values)
-    n, bins, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
+    _, _, patches = plt.hist(values, density=True, histtype="step", cumulative=True, linewidth=3)
    patches[0].set_xy(patches[0].get_xy()[:-1])
    plt.xlim(max(0, min(values) - 0.1), 1)
    plt.title("CDF")
@@ -116,7 +116,7 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     for i in range(len(flow_ids)):
         flow_values = evaluations[evaluations.flow_id == flow_ids[i]].value
         df = pd.concat([df, flow_values], ignore_index=True, axis=1)
-    fig, axs = plt.subplots()
+    _, axs = plt.subplots()
     df.boxplot()
     axs.set_title("Boxplot comparing " + metric + " for different flows")
     axs.set_ylabel(metric)
@@ -178,4 +178,4 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
     function="predictive_accuracy", flows=[6767], size=100, parameters_in_separate_columns=True
 )
 
-print(evals_setups.head(10))
\ No newline at end of file
+print(evals_setups.head(10))
diff --git a/examples/Basics/introduction_tutorial.py b/examples/Basics/introduction_tutorial.py
index c864772f5..24a71d8e0 100644
--- a/examples/Basics/introduction_tutorial.py
+++ b/examples/Basics/introduction_tutorial.py
@@ -12,7 +12,7 @@
 # For certain functionality, such as uploading tasks or datasets, users have to
 # sign up. Only accessing the data on OpenML does not require an account!
 #
-# If you don’t have an account yet, sign up now.
+# If you don't have an account yet, sign up now.
 # You will receive an API key, which will authenticate you to the server
 # and allow you to download and upload datasets, tasks, runs and flows.
 #
@@ -52,4 +52,4 @@
 # %%
 import openml
 
-openml.config.set_root_cache_directory("YOURDIR")
\ No newline at end of file
+openml.config.set_root_cache_directory("YOURDIR")
diff --git a/examples/_external_or_deprecated/2015_neurips_feurer_example.py b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
index ae59c9ced..2dfc4bb97 100644
--- a/examples/_external_or_deprecated/2015_neurips_feurer_example.py
+++ b/examples/_external_or_deprecated/2015_neurips_feurer_example.py
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -68,7 +66,7 @@
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else:
diff --git a/examples/_external_or_deprecated/2018_ida_strang_example.py b/examples/_external_or_deprecated/2018_ida_strang_example.py
index 8b225125b..0e180badf 100644
--- a/examples/_external_or_deprecated/2018_ida_strang_example.py
+++ b/examples/_external_or_deprecated/2018_ida_strang_example.py
@@ -17,8 +17,8 @@
 # License: BSD 3-Clause
 
 import matplotlib.pyplot as plt
+
 import openml
-import pandas as pd
 
 ##############################################################################
 # A basic step for each data-mining or machine learning task is to determine
@@ -86,10 +86,9 @@
 def determine_class(val_lin, val_nonlin):
     if val_lin < val_nonlin:
         return class_values[0]
-    elif val_nonlin < val_lin:
+    if val_nonlin < val_lin:
         return class_values[1]
-    else:
-        return class_values[2]
+    return class_values[2]
 
 
 evaluations["class"] = evaluations.apply(
diff --git a/examples/_external_or_deprecated/2018_kdd_rijn_example.py b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
index 6522013e3..957281616 100644
--- a/examples/_external_or_deprecated/2018_kdd_rijn_example.py
+++ b/examples/_external_or_deprecated/2018_kdd_rijn_example.py
@@ -32,16 +32,17 @@
 
 import sys
 
-if sys.platform == "win32":  # noqa
+if sys.platform == "win32":
     print(
         "The pyrfr library (requirement of fanova) can currently not be installed on Windows systems"
     )
-    exit()
+    sys.exit()
 
 # DEPRECATED EXAMPLE -- Avoid running this code in our CI/CD pipeline
 print("This example is deprecated, remove the `if False` in this code to use it manually.")
 if False:
     import json
+
     import fanova
     import matplotlib.pyplot as plt
     import pandas as pd
@@ -49,7 +50,6 @@
 
     import openml
 
-
     ##############################################################################
     # With the advent of automated machine learning, automated hyperparameter
     # optimization methods are by now routinely used in data mining. However, this
@@ -80,7 +80,7 @@
     # important when it is put on a log-scale. All these simplifications can be
    # addressed by defining a ConfigSpace. For a more elaborated example that uses
     # this, please see:
-    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py  # noqa F401
+    # https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py
 
     suite = openml.study.get_suite("OpenML100")
     flow_id = 7707
@@ -97,8 +97,7 @@
         if limit_nr_tasks is not None and idx >= limit_nr_tasks:
             continue
         print(
-            "Starting with task %d (%d/%d)"
-            % (task_id, idx + 1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks)
+            f"Starting with task {task_id} ({idx + 1}/{len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks})"
         )
         # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
         evals = openml.evaluations.list_evaluations_setups(
@@ -121,13 +120,13 @@
                 [
                     dict(
                         **{name: json.loads(value) for name, value in setup["parameters"].items()},
-                        **{performance_column: setup[performance_column]}
+                        **{performance_column: setup[performance_column]},
                     )
                     for _, setup in evals.iterrows()
                 ]
             )
         except json.decoder.JSONDecodeError as e:
-            print("Task %d error: %s" % (task_id, e))
+            print(f"Task {task_id} error: {e}")
             continue
         # apply our filters, to have only the setups that comply to the hyperparameters we want
         for filter_key, filter_value in parameter_filters.items():
@@ -156,19 +155,21 @@
                 Y=setups_evals[performance_column].to_numpy(),
                 n_trees=n_trees,
             )
-            for idx, pname in enumerate(parameter_names):
+            for idx, pname in enumerate(parameter_names):  # noqa: PLW2901
                 try:
                     fanova_results.append(
                         {
                             "hyperparameter": pname.split(".")[-1],
-                            "fanova": evaluator.quantify_importance([idx])[(idx,)]["individual importance"],
+                            "fanova": evaluator.quantify_importance([idx])[(idx,)][
+                                "individual importance"
+                            ],
                         }
                     )
                 except RuntimeError as e:
                     # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
                     # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
                     # paper).
-                    print("Task %d error: %s" % (task_id, e))
+                    print(f"Task {task_id} error: {e}")
                     continue
 
         # transform ``fanova_results`` from a list of dicts into a DataFrame
diff --git a/examples/_external_or_deprecated/2018_neurips_perrone_example.py b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
index 0d72846ac..53f1fbe27 100644
--- a/examples/_external_or_deprecated/2018_neurips_perrone_example.py
+++ b/examples/_external_or_deprecated/2018_neurips_perrone_example.py
@@ -27,16 +27,17 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
 from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
 from sklearn.metrics import mean_squared_error
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.ensemble import RandomForestRegressor
+
+import openml
 
 flow_type = "svm"  # this example will use the smaller svm flow evaluations
 ############################################################################
@@ -44,7 +45,7 @@
 # a tabular format that can be used to build models.
 
 
-def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):
+def fetch_evaluations(run_full=False, flow_type="svm", metric="area_under_roc_curve"):  # noqa: FBT002
     """
     Fetch a list of evaluations based on the flows and tasks used in the experiments.
 
@@ -101,7 +102,10 @@
 
 
 def create_table_from_evaluations(
-    eval_df, flow_type="svm", run_count=np.iinfo(np.int64).max, task_ids=None
+    eval_df,
+    flow_type="svm",
+    run_count=np.iinfo(np.int64).max,  # noqa: B008
+    task_ids=None,
 ):
     """
     Create a tabular data with its ground truth from a dataframe of evaluations.
@@ -206,7 +210,7 @@ def list_categorical_attributes(flow_type="svm"):
 model.fit(X, y)
 y_pred = model.predict(X)
 
-print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred)))
+print(f"Training RMSE : {mean_squared_error(y, y_pred):.5}")
 
 
 #############################################################################
diff --git a/examples/_external_or_deprecated/benchmark_with_optunahub.py b/examples/_external_or_deprecated/benchmark_with_optunahub.py
index ece3e7c40..38114bc44 100644
--- a/examples/_external_or_deprecated/benchmark_with_optunahub.py
+++ b/examples/_external_or_deprecated/benchmark_with_optunahub.py
@@ -100,7 +100,7 @@ def objective(trial: optuna.Trial) -> Pipeline:
         run.publish()
         logger.log(1, f"Run was uploaded to - {run.openml_url}")
 
-    except Exception as e:
+    except Exception as e:  # noqa: BLE001
         logger.log(1, f"Could not publish run - {e}")
     else:
         logger.log(
diff --git a/examples/_external_or_deprecated/fetch_runtimes_tutorial.py b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
index b2a3f1d2a..c8f85adc5 100644
--- a/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
+++ b/examples/_external_or_deprecated/fetch_runtimes_tutorial.py
@@ -39,17 +39,16 @@
 #
 # * (Case 5) Running models that do not release the Python Global Interpreter Lock (GIL)
 
-import openml
 import numpy as np
-from matplotlib import pyplot as plt
 from joblib.parallel import parallel_backend
-
-from sklearn.naive_bayes import GaussianNB
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.neural_network import MLPClassifier
+from matplotlib import pyplot as plt
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neural_network import MLPClassifier
+from sklearn.tree import DecisionTreeClassifier
 
+import openml
 
 # %% [markdown]
 # # Preparing tasks and scikit-learn models
@@ -63,12 +62,7 @@
 # Viewing associated data
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
-    "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
-        task_id,
-        n_repeats,
-        n_folds,
-        n_samples,
-    )
+    f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}."
 )
 
 
@@ -101,7 +95,7 @@ def print_compare_runtimes(measures):
 
 measures = run1.fold_evaluations
 print("The timing and performance metrics available: ")
-for key in measures.keys():
+for key in measures:
     print(key)
 print()
 
@@ -206,7 +200,6 @@
 # included in the `wall_clock_time_millis_training` measure recorded.
 
 # %%
-from sklearn.model_selection import GridSearchCV
 
 clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
@@ -284,22 +277,18 @@ def print_compare_runtimes(measures):
 
 # %%
 
+
 def extract_refit_time(run, repeat, fold):
-    refit_time = (
+    return (
         run.fold_evaluations["wall_clock_time_millis"][repeat][fold]
         - run.fold_evaluations["wall_clock_time_millis_training"][repeat][fold]
         - run.fold_evaluations["wall_clock_time_millis_testing"][repeat][fold]
     )
-    return refit_time
 
 
 for repeat in range(n_repeats):
     for fold in range(n_folds):
-        print(
-            "Repeat #{}-Fold #{}: {:.4f}".format(
-                repeat, fold, extract_refit_time(run4, repeat, fold)
-            )
-        )
+        print(f"Repeat #{repeat}-Fold #{fold}: {extract_refit_time(run4, repeat, fold):.4f}")
 
 # %% [markdown]
 # Along with the GridSearchCV already used above, we demonstrate how such
diff --git a/examples/_external_or_deprecated/flow_id_tutorial.py b/examples/_external_or_deprecated/flow_id_tutorial.py
index e813655fc..19190cf0b 100644
--- a/examples/_external_or_deprecated/flow_id_tutorial.py
+++ b/examples/_external_or_deprecated/flow_id_tutorial.py
@@ -9,7 +9,6 @@
 
 import openml
 
-
 # %% [markdown]
 # .. warning::
 #    .. include:: ../../test_server_usage_warning.txt
@@ -48,7 +47,7 @@
 # %% [markdown]
 # ## 2. Obtaining a flow given its name
 # The schema of a flow is given in XSD (
-# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).  # noqa E501
+# [here](https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd)).
 # Only two fields are required, a unique name, and an external version. While it should be pretty
 # obvious why we need a name, the need for the additional external version information might not
 # be immediately clear. However, this information is very important as it allows to have multiple
diff --git a/examples/_external_or_deprecated/flows_and_runs_tutorial.py b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
index 2d1bcb864..71d6960bd 100644
--- a/examples/_external_or_deprecated/flows_and_runs_tutorial.py
+++ b/examples/_external_or_deprecated/flows_and_runs_tutorial.py
@@ -3,8 +3,7 @@
 # This tutorial covers how to train/run a model and how to upload the results.
 
 # %%
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
 
 import openml
 
diff --git a/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
index faced588b..7bb72db5a 100644
--- a/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
+++ b/examples/_external_or_deprecated/plot_svm_hyperparameters_tutorial.py
@@ -2,9 +2,10 @@
 # # Plotting hyperparameter surfaces
 
 # %%
-import openml
 import numpy as np
 
+import openml
+
 # %% [markdown]
 # # First step - obtaining the data
 # First, we need to choose an SVM flow, for example 8353, and a task. Finding the IDs of them are
diff --git a/examples/_external_or_deprecated/run_setup_tutorial.py b/examples/_external_or_deprecated/run_setup_tutorial.py
index 55d25d291..25591bb58 100644
--- a/examples/_external_or_deprecated/run_setup_tutorial.py
+++ b/examples/_external_or_deprecated/run_setup_tutorial.py
@@ -23,15 +23,15 @@
 
 # %%
 import numpy as np
-import openml
-from openml.extensions.sklearn import cat, cont
-
-from sklearn.pipeline import make_pipeline, Pipeline
 from sklearn.compose import ColumnTransformer
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
-from sklearn.ensemble import RandomForestClassifier
 from sklearn.decomposition import TruncatedSVD
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+import openml
+from openml.extensions.sklearn import cat, cont
 
 # %% [markdown]
 # .. warning::
diff --git a/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
index 15ec0e1fb..53472fb06 100644
--- a/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
+++ b/examples/_external_or_deprecated/upload_amlb_flows_and_runs.py
@@ -14,10 +14,10 @@
 
 # %%
 from collections import OrderedDict
+
 import numpy as np
 
 import openml
-from openml import OpenMLClassificationTask
 from openml.runs.functions import format_prediction
 
 # %% [markdown]
@@ -43,17 +43,17 @@
 # version of the package/script is used. Use tags so users can find your flow easily.
 
 # %%
-general = dict(
-    name="automlbenchmark_autosklearn",
-    description=(
+general = {
+    "name": "automlbenchmark_autosklearn",
+    "description": (
         "Auto-sklearn as set up by the AutoML Benchmark"
         "Source: https://github.com/openml/automlbenchmark/releases/tag/v0.9"
     ),
-    external_version="amlb==0.9",
-    language="English",
-    tags=["amlb", "benchmark", "study_218"],
-    dependencies="amlb==0.9",
-)
+    "external_version": "amlb==0.9",
+    "language": "English",
+    "tags": ["amlb", "benchmark", "study_218"],
+    "dependencies": "amlb==0.9",
+}
 
 # %% [markdown]
 # Next we define the flow hyperparameters. We define their name and default value in `parameters`,
@@ -62,14 +62,14 @@
 # The use of ordered dicts is required.
 
 # %%
-flow_hyperparameters = dict(
-    parameters=OrderedDict(time="240", memory="32", cores="8"),
-    parameters_meta_info=OrderedDict(
+flow_hyperparameters = {
+    "parameters": OrderedDict(time="240", memory="32", cores="8"),
+    "parameters_meta_info": OrderedDict(
         cores=OrderedDict(description="number of available cores", data_type="int"),
         memory=OrderedDict(description="memory in gigabytes", data_type="int"),
         time=OrderedDict(description="time in minutes", data_type="int"),
     ),
-)
+}
 
 # %% [markdown]
 # It is possible to build a flow which uses other flows.
@@ -89,11 +89,11 @@
 
 # %%
 autosklearn_flow = openml.flows.get_flow(9313)  # auto-sklearn 0.5.1
-subflow = dict(
-    components=OrderedDict(automl_tool=autosklearn_flow),
+subflow = {
+    "components": OrderedDict(automl_tool=autosklearn_flow),
     # If you do not want to reference a subflow, you can use the following:
     # components=OrderedDict(),
-)
+}
 
 # %% [markdown]
 # With all parameters of the flow defined, we can now initialize the OpenMLFlow and publish.
@@ -172,7 +172,7 @@
 ]
 
 # random class probabilities (Iris has 150 samples and 3 classes):
-r = np.random.rand(150 * n_repeats, 3)
+r = np.random.rand(150 * n_repeats, 3)  # noqa: NPY002
 # scale the random values so that the probabilities of each sample sum to 1:
 y_proba = r / r.sum(axis=1).reshape(-1, 1)
 y_pred = y_proba.argmax(axis=1)
@@ -194,7 +194,7 @@
             index=index,
             prediction=class_map[yp],
             truth=y,
-            proba={c: pb for (c, pb) in zip(task.class_labels, proba)},
+            proba=dict(zip(task.class_labels, proba)),
         )
         predictions.append(prediction)
 
@@ -203,7 +203,7 @@
 # We use the argument setup_string because the used flow was a script.
 
 # %%
-benchmark_command = f"python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
+benchmark_command = "python3 runbenchmark.py auto-sklearn medium -m aws -t 119"
 my_run = openml.runs.OpenMLRun(
     task_id=task_id,
     flow_id=flow_id,
diff --git a/openml/__init__.py b/openml/__init__.py
index c49505eb9..ae5db261f 100644
--- a/openml/__init__.py
+++ b/openml/__init__.py
@@ -91,33 +91,33 @@ def populate_cache(
 
 
 __all__ = [
-    "OpenMLDataset",
+    "OpenMLBenchmarkSuite",
+    "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
     "OpenMLDataFeature",
-    "OpenMLRun",
-    "OpenMLSplit",
+    "OpenMLDataset",
     "OpenMLEvaluation",
-    "OpenMLSetup",
-    "OpenMLParameter",
-    "OpenMLTask",
-    "OpenMLSupervisedTask",
-    "OpenMLClusteringTask",
+    "OpenMLFlow",
     "OpenMLLearningCurveTask",
+    "OpenMLParameter",
     "OpenMLRegressionTask",
-    "OpenMLClassificationTask",
-    "OpenMLFlow",
+    "OpenMLRun",
+    "OpenMLSetup",
+    "OpenMLSplit",
     "OpenMLStudy",
-    "OpenMLBenchmarkSuite",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "__version__",
+    "_api_calls",
+    "config",
     "datasets",
     "evaluations",
     "exceptions",
     "extensions",
-    "config",
-    "runs",
     "flows",
-    "tasks",
+    "runs",
     "setups",
     "study",
+    "tasks",
     "utils",
-    "_api_calls",
-    "__version__",
 ]
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index 81296b3da..4c0083394 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -133,7 +133,7 @@ def _perform_api_call(
 def _download_minio_file(
     source: str,
     destination: str | Path,
-    exists_ok: bool = True,  # noqa: FBT001, FBT002
+    exists_ok: bool = True,  # noqa: FBT002
     proxy: str | None = "auto",
 ) -> None:
     """Download file ``source`` from a MinIO Bucket and store it at ``destination``.
@@ -239,7 +239,7 @@ def _download_text_file(
     source: str,
     output_path: str | Path | None = None,
     md5_checksum: str | None = None,
-    exists_ok: bool = True,  # noqa: FBT001, FBT002
+    exists_ok: bool = True,  # noqa: FBT002
     encoding: str = "utf8",
 ) -> str | None:
     """Download the text file at `source` and store it in `output_path`.
diff --git a/openml/config.py b/openml/config.py
index cf66a6346..10379789a 100644
--- a/openml/config.py
+++ b/openml/config.py
@@ -37,7 +37,7 @@ class _Config(TypedDict):
     show_progress: bool
 
 
-def _create_log_handlers(create_file_handler: bool = True) -> None:  # noqa: FBT001, FBT002
+def _create_log_handlers(create_file_handler: bool = True) -> None:  # noqa: FBT002
     """Creates but does not attach the log handlers."""
     global console_handler, file_handler  # noqa: PLW0603
     if console_handler is not None or file_handler is not None:
@@ -172,7 +172,7 @@ def get_server_base_url() -> str:
     -------
     str
     """
-    domain, path = server.split("/api", maxsplit=1)
+    domain, _path = server.split("/api", maxsplit=1)
     return domain.replace("api", "www")
 
 
@@ -257,8 +257,8 @@ def stop_using_configuration_for_example(cls) -> None:
         global server  # noqa: PLW0603
         global apikey  # noqa: PLW0603
 
-        server = cast(str, cls._last_used_server)
-        apikey = cast(str, cls._last_used_key)
+        server = cast("str", cls._last_used_server)
+        apikey = cast("str", cls._last_used_key)
         cls._start_last_called = False
 
 
@@ -515,10 +515,10 @@ def overwrite_config_context(config: dict[str, Any]) -> Iterator[_Config]:
 
 __all__ = [
     "get_cache_directory",
+    "get_config_as_dict",
     "set_root_cache_directory",
     "start_using_configuration_for_example",
     "stop_using_configuration_for_example",
-    "get_config_as_dict",
 ]
 
 _setup()
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
index 480dd9576..eb0932652 100644
--- a/openml/datasets/__init__.py
+++ b/openml/datasets/__init__.py
@@ -17,17 +17,17 @@
 )
 
 __all__ = [
+    "OpenMLDataFeature",
+    "OpenMLDataset",
     "attributes_arff_from_df",
     "check_datasets_active",
     "create_dataset",
+    "delete_dataset",
+    "edit_dataset",
+    "fork_dataset",
     "get_dataset",
     "get_datasets",
     "list_datasets",
-    "OpenMLDataset",
-    "OpenMLDataFeature",
-    "status_update",
     "list_qualities",
-    "edit_dataset",
-    "fork_dataset",
-    "delete_dataset",
+    "status_update",
 ]
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
index 218b0066d..f753c1f7c 100644
--- a/openml/datasets/data_feature.py
+++ b/openml/datasets/data_feature.py
@@ -7,7 +7,7 @@
 from IPython.lib import pretty
 
 
-class OpenMLDataFeature:
+class OpenMLDataFeature:  # noqa: PLW1641
     """
     Data Feature (a.k.a. Attribute) object.
 
@@ -51,8 +51,7 @@ def __init__(  # noqa: PLR0913
         if data_type == "nominal":
             if nominal_values is None:
                 raise TypeError(
-                    "Dataset features require attribute `nominal_values` for nominal "
-                    "feature type.",
+                    "Dataset features require attribute `nominal_values` for nominal feature type.",
                 )
 
             if not isinstance(nominal_values, list):
@@ -75,10 +74,10 @@ def __init__(  # noqa: PLR0913
         self.ontologies = ontologies
 
     def __repr__(self) -> str:
-        return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)
+        return f"[{self.index} - {self.name} ({self.data_type})]"
 
     def __eq__(self, other: Any) -> bool:
         return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__
 
-    def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None:  # noqa: FBT001, ARG002
+    def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None:  # noqa: ARG002
         pp.text(str(self))
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index fa83d2b8a..e3ba166d9 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -41,7 +41,7 @@ def _ensure_dataframe(
     raise TypeError(f"Data type {type(data)} not supported.")
 
 
-class OpenMLDataset(OpenMLBase):
+class OpenMLDataset(OpenMLBase):  # noqa: PLW1641
     """Dataset object.
 
     Allows fetching and uploading datasets to OpenML.
@@ -719,8 +719,8 @@ def valid_category(cat: Any) -> bool:
     def get_data(  # noqa: C901
         self,
         target: list[str] | str | None = None,
-        include_row_id: bool = False,  # noqa: FBT001, FBT002
-        include_ignore_attribute: bool = False,  # noqa: FBT001, FBT002
+        include_row_id: bool = False,  # noqa: FBT002
+        include_ignore_attribute: bool = False,  # noqa: FBT002
     ) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]:
         """Returns dataset content as dataframes.
 
@@ -863,8 +863,8 @@ def get_features_by_type(  # noqa: C901
         self,
         data_type: str,
         exclude: list[str] | None = None,
-        exclude_ignore_attribute: bool = True,  # noqa: FBT002, FBT001
-        exclude_row_id_attribute: bool = True,  # noqa: FBT002, FBT001
+        exclude_ignore_attribute: bool = True,  # noqa: FBT002
+        exclude_row_id_attribute: bool = True,  # noqa: FBT002
     ) -> list[int]:
         """
         Return indices of features of a given type, e.g. all nominal features.
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index ac5466a44..75f5aec67 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -259,7 +259,7 @@ def _validated_data_attributes(
 
 def check_datasets_active(
     dataset_ids: list[int],
-    raise_error_if_not_exist: bool = True,  # noqa: FBT001, FBT002
+    raise_error_if_not_exist: bool = True,  # noqa: FBT002
 ) -> dict[int, bool]:
     """
     Check if the dataset ids provided are active.
@@ -293,7 +293,7 @@ def check_datasets_active(
 
 def _name_to_id(
     dataset_name: str,
    version: int | None = None,
-    error_if_multiple: bool = False,  # noqa: FBT001, FBT002
+    error_if_multiple: bool = False,  # noqa: FBT002
 ) -> int:
     """Attempt to find the dataset id of the dataset with the given name.
@@ -341,8 +341,8 @@ def _name_to_id(
 
 def get_datasets(
     dataset_ids: list[str | int],
-    download_data: bool = False,  # noqa: FBT001, FBT002
-    download_qualities: bool = False,  # noqa: FBT001, FBT002
+    download_data: bool = False,  # noqa: FBT002
+    download_qualities: bool = False,  # noqa: FBT002
 ) -> list[OpenMLDataset]:
     """Download datasets.
@@ -377,14 +377,14 @@ def get_datasets(
 @openml.utils.thread_safe_if_oslo_installed
 def get_dataset(  # noqa: C901, PLR0912
     dataset_id: int | str,
-    download_data: bool = False,  # noqa: FBT002, FBT001
+    download_data: bool = False,  # noqa: FBT002
     version: int | None = None,
-    error_if_multiple: bool = False,  # noqa: FBT002, FBT001
+    error_if_multiple: bool = False,  # noqa: FBT002
     cache_format: Literal["pickle", "feather"] = "pickle",
-    download_qualities: bool = False,  # noqa: FBT002, FBT001
-    download_features_meta_data: bool = False,  # noqa: FBT002, FBT001
-    download_all_files: bool = False,  # noqa: FBT002, FBT001
-    force_refresh_cache: bool = False,  # noqa: FBT001, FBT002
+    download_qualities: bool = False,  # noqa: FBT002
+    download_features_meta_data: bool = False,  # noqa: FBT002
+    download_all_files: bool = False,  # noqa: FBT002
+    force_refresh_cache: bool = False,  # noqa: FBT002
 ) -> OpenMLDataset:
     """Download the OpenML dataset representation, optionally also download actual data file.
 
@@ -1116,7 +1116,7 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str,
 def _get_dataset_parquet(
     description: dict | OpenMLDataset,
     cache_directory: Path | None = None,
-    download_all_files: bool = False,  # noqa: FBT001, FBT002
+    download_all_files: bool = False,  # noqa: FBT002
 ) -> Path | None:
     """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded.
 
@@ -1418,7 +1418,7 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None:
     str or None
         A string representation of an ARFF file. Or None if file already exists.
     """
-    dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
+    dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get")
     # build a dict from the xml.
     # use the url from the dataset description and return the ARFF string
     return openml._api_calls._download_text_file(
@@ -1439,7 +1439,7 @@ def _get_online_dataset_format(dataset_id: int) -> str:
 
     str
         Dataset format.
""" - dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get") + dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get") # build a dict from the xml and get the format from the dataset description return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py index dbff47037..b56d0c2d5 100644 --- a/openml/evaluations/__init__.py +++ b/openml/evaluations/__init__.py @@ -5,7 +5,7 @@ __all__ = [ "OpenMLEvaluation", - "list_evaluations", "list_evaluation_measures", + "list_evaluations", "list_evaluations_setups", ] diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 6d69d377e..8c54e686e 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -123,7 +123,7 @@ def __repr__(self) -> str: "Run ID", "OpenML Run URL", "Task ID", - "OpenML Task URL" "Flow ID", + "OpenML Task URLFlow ID", "OpenML Flow URL", "Setup ID", "Data ID", diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 7747294d7..c6bad9c91 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -228,7 +228,7 @@ def __list_evaluations(api_call: str) -> list[OpenMLEvaluation]: # Minimalistic check if the XML is useful if "oml:evaluations" not in evals_dict: raise ValueError( - "Error in return XML, does not contain " f'"oml:evaluations": {evals_dict!s}', + f'Error in return XML, does not contain "oml:evaluations": {evals_dict!s}', ) assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type( @@ -339,7 +339,7 @@ def list_evaluations_setups( tag: str | None = None, per_fold: bool | None = None, sort_order: str | None = None, - parameters_in_separate_columns: bool = False, # noqa: FBT001, FBT002 + parameters_in_separate_columns: bool = False, # noqa: FBT002 ) -> pd.DataFrame: """List all run-evaluation pairs matching all of the given filters and their hyperparameter settings. diff --git a/openml/extensions/__init__.py b/openml/extensions/__init__.py index b49865e0e..40d3443d3 100644 --- a/openml/extensions/__init__.py +++ b/openml/extensions/__init__.py @@ -10,7 +10,7 @@ __all__ = [ "Extension", - "register_extension", - "get_extension_by_model", "get_extension_by_flow", + "get_extension_by_model", + "register_extension", ] diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 2a336eb52..e391d109a 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -63,8 +63,8 @@ def can_handle_model(cls, model: Any) -> bool: def flow_to_model( self, flow: OpenMLFlow, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - strict_version: bool = True, # noqa: FBT002, FBT001 + initialize_with_defaults: bool = False, # noqa: FBT002 + strict_version: bool = True, # noqa: FBT002 ) -> Any: """Instantiate a model from the flow representation. diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py index 7a944c997..b75f8294c 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -40,7 +40,7 @@ def register_extension(extension: type[Extension]) -> None: def get_extension_by_flow( flow: OpenMLFlow, - raise_if_no_extension: bool = False, # noqa: FBT001, FBT002 + raise_if_no_extension: bool = False, # noqa: FBT002 ) -> Extension | None: """Get an extension which can handle the given flow. 
@@ -85,7 +85,7 @@ def get_extension_by_flow(
 
 def get_extension_by_model(
     model: Any,
-    raise_if_no_extension: bool = False,  # noqa: FBT001, FBT002
+    raise_if_no_extension: bool = False,  # noqa: FBT002
 ) -> Extension | None:
     """Get an extension which can handle the given flow.
diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
index ce32fec7d..d455249de 100644
--- a/openml/flows/__init__.py
+++ b/openml/flows/__init__.py
@@ -12,10 +12,10 @@
 
 __all__ = [
     "OpenMLFlow",
-    "get_flow",
-    "list_flows",
-    "get_flow_id",
-    "flow_exists",
     "assert_flows_equal",
     "delete_flow",
+    "flow_exists",
+    "get_flow",
+    "get_flow_id",
+    "list_flows",
 ]
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index 02d24e78b..9d86cd682 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -169,7 +169,7 @@ def extension(self) -> Extension:
         """The extension of the flow (e.g., sklearn)."""
         if self._extension is None:
             self._extension = cast(
-                Extension, get_extension_by_flow(self, raise_if_no_extension=True)
+                "Extension", get_extension_by_flow(self, raise_if_no_extension=True)
             )
         return self._extension
 
@@ -408,7 +408,7 @@ def _parse_publish_response(self, xml_response: dict) -> None:
         """Parse the id from the xml_response and assign it to self."""
         self.flow_id = int(xml_response["oml:upload_flow"]["oml:id"])
 
-    def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: FBT001, FBT002
+    def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: FBT002
         """Publish this flow to OpenML server.
 
         Raises a PyOpenMLError if the flow exists on the server, but
@@ -435,7 +435,7 @@ def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: F
         if not flow_id:
             if self.flow_id:
                 raise openml.exceptions.PyOpenMLError(
-                    "Flow does not exist on the server, " "but 'flow.flow_id' is not None.",
+                    "Flow does not exist on the server, but 'flow.flow_id' is not None.",
                 )
             super().publish()
             assert self.flow_id is not None  # for mypy
@@ -445,7 +445,7 @@ def publish(self, raise_error_if_exists: bool = False) -> OpenMLFlow:  # noqa: F
             raise openml.exceptions.PyOpenMLError(error_message)
         elif self.flow_id is not None and self.flow_id != flow_id:
             raise openml.exceptions.PyOpenMLError(
-                "Local flow_id does not match server flow_id: " f"'{self.flow_id}' vs '{flow_id}'",
+                f"Local flow_id does not match server flow_id: '{self.flow_id}' vs '{flow_id}'",
             )
 
         flow = openml.flows.functions.get_flow(flow_id)
@@ -517,7 +517,7 @@ def get_subflow(self, structure: list[str]) -> OpenMLFlow:
         sub_identifier = structure[0]
         if sub_identifier not in self.components:
             raise ValueError(
-                f"Flow {self.name} does not contain component with " f"identifier {sub_identifier}",
+                f"Flow {self.name} does not contain component with identifier {sub_identifier}",
             )
         if len(structure) == 1:
             return self.components[sub_identifier]  # type: ignore
diff --git a/openml/flows/functions.py b/openml/flows/functions.py
index 9906958e5..6b2f0ba04 100644
--- a/openml/flows/functions.py
+++ b/openml/flows/functions.py
@@ -31,7 +31,7 @@ def _get_cached_flows() -> OrderedDict:
     flows = OrderedDict()  # type: 'OrderedDict[int, OpenMLFlow]'
     flow_cache_dir = openml.utils._create_cache_directory(FLOWS_CACHE_DIR_NAME)
-    directory_content = os.listdir(flow_cache_dir)
+    directory_content = os.listdir(flow_cache_dir)  # noqa: PTH208
     directory_content.sort()
 
     # Find all flow ids for which we have downloaded
     # the flow description
@@ -66,11 +66,11 @@ def _get_cached_flow(fid: int) -> OpenMLFlow:
             return _create_flow_from_xml(fh.read())
     except OSError as e:
         openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir)
-        raise OpenMLCacheException("Flow file for fid %d not cached" % fid) from e
+        raise OpenMLCacheException(f"Flow file for fid {fid} not cached") from e
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow:  # noqa: FBT001, FBT002
+def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow:  # noqa: FBT002
     """Download the OpenML flow for a given flow ID.
 
     Parameters
@@ -124,7 +124,7 @@ def _get_flow_description(flow_id: int) -> OpenMLFlow:
     xml_file = (
         openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id) / "flow.xml"
     )
-    flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get")
+    flow_xml = openml._api_calls._perform_api_call(f"flow/{flow_id}", request_method="get")
 
     with xml_file.open("w", encoding="utf8") as fh:
         fh.write(flow_xml)
@@ -245,7 +245,7 @@ def flow_exists(name: str, external_version: str) -> int | bool:
 
 def get_flow_id(
     model: Any | None = None,
     name: str | None = None,
-    exact_version: bool = True,  # noqa: FBT001, FBT002
+    exact_version: bool = True,  # noqa: FBT002
 ) -> int | bool | list[int]:
     """Retrieves the flow id for a model or a flow name.
@@ -364,9 +364,9 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
     flow1: OpenMLFlow,
     flow2: OpenMLFlow,
     ignore_parameter_values_on_older_children: str | None = None,
-    ignore_parameter_values: bool = False,  # noqa: FBT001, FBT002
-    ignore_custom_name_if_none: bool = False,  # noqa: FBT001, FBT002
-    check_description: bool = True,  # noqa: FBT001, FBT002
+    ignore_parameter_values: bool = False,  # noqa: FBT002
+    ignore_custom_name_if_none: bool = False,  # noqa: FBT002
+    check_description: bool = True,  # noqa: FBT002
 ) -> None:
     """Check equality of two flows.
 
@@ -456,9 +456,9 @@ def assert_flows_equal(  # noqa: C901, PLR0912, PLR0913, PLR0915
         )
 
     if ignore_parameter_values_on_older_children:
-        assert (
-            flow1.upload_date is not None
-        ), "Flow1 has no upload date that allows us to compare age of children."
+        assert flow1.upload_date is not None, (
+            "Flow1 has no upload date that allows us to compare age of children."
+        )
         upload_date_current_flow = dateutil.parser.parse(flow1.upload_date)
         upload_date_parent_flow = dateutil.parser.parse(
             ignore_parameter_values_on_older_children,
diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py
index 6d3dca504..2f068a2e6 100644
--- a/openml/runs/__init__.py
+++ b/openml/runs/__init__.py
@@ -19,14 +19,14 @@
     "OpenMLRun",
     "OpenMLRunTrace",
     "OpenMLTraceIteration",
-    "run_model_on_task",
-    "run_flow_on_task",
+    "delete_run",
     "get_run",
-    "list_runs",
-    "get_runs",
     "get_run_trace",
-    "run_exists",
+    "get_runs",
     "initialize_model_from_run",
     "initialize_model_from_trace",
-    "delete_run",
+    "list_runs",
+    "run_exists",
+    "run_flow_on_task",
+    "run_model_on_task",
 ]
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 666b75c37..fc9d6c76b 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -62,9 +62,9 @@ def run_model_on_task(  # noqa: PLR0913
     avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
-    add_local_measures: bool = True,  # noqa: FBT001, FBT002
-    upload_flow: bool = False,  # noqa: FBT001, FBT002
-    return_flow: bool = False,  # noqa: FBT001, FBT002
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
+    return_flow: bool = False,  # noqa: FBT002
     n_jobs: int | None = None,
 ) -> OpenMLRun | tuple[OpenMLRun, OpenMLFlow]:
     """Run the model on the dataset defined by the task.
@@ -181,8 +181,8 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     avoid_duplicate_runs: bool | None = None,
     flow_tags: list[str] | None = None,
     seed: int | None = None,
-    add_local_measures: bool = True,  # noqa: FBT001, FBT002
-    upload_flow: bool = False,  # noqa: FBT001, FBT002
+    add_local_measures: bool = True,  # noqa: FBT002
+    upload_flow: bool = False,  # noqa: FBT002
     n_jobs: int | None = None,
 ) -> OpenMLRun:
     """Run the model provided by the flow on the dataset defined by task.
@@ -353,7 +353,7 @@ def get_run_trace(run_id: int) -> OpenMLRunTrace:
     -------
     openml.runs.OpenMLTrace
     """
-    trace_xml = openml._api_calls._perform_api_call("run/trace/%d" % run_id, "get")
+    trace_xml = openml._api_calls._perform_api_call(f"run/trace/{run_id}", "get")
     return OpenMLRunTrace.trace_from_xml(trace_xml)
 
 
@@ -798,7 +798,7 @@ def get_runs(run_ids: list[int]) -> list[OpenMLRun]:
 
 
 @openml.utils.thread_safe_if_oslo_installed
-def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002, FBT001
+def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT002
     """Gets run corresponding to run_id.
 
     Parameters
@@ -828,14 +828,14 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun:  # noqa: FBT0
             raise OpenMLCacheException(message="dummy")
 
     except OpenMLCacheException:
-        run_xml = openml._api_calls._perform_api_call("run/%d" % run_id, "get")
+        run_xml = openml._api_calls._perform_api_call(f"run/{run_id}", "get")
         with run_file.open("w", encoding="utf8") as fh:
             fh.write(run_xml)
 
     return _create_run_from_xml(run_xml)
 
 
-def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun:  # noqa: PLR0915, PLR0912, C901, FBT001, FBT002
+def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun:  # noqa: PLR0915, PLR0912, C901, FBT002
     """Create a run object from xml returned from server.
 
     Parameters
@@ -977,7 +977,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):  # type: ignore
             evaluations[key] = value
 
     if "description" not in files and from_server is True:
-        raise ValueError("No description file for run %d in run description XML" % run_id)
+        raise ValueError(f"No description file for run {run_id} in run description XML")
 
     if "predictions" not in files and from_server is True:
         task = openml.tasks.get_task(task_id)
@@ -988,7 +988,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):  # type: ignore
             # a run can consist without predictions. But for now let's keep it
             # Matthias: yes, it should stay as long as we do not really handle
             # this stuff
-            raise ValueError("No prediction files for run %d in run description XML" % run_id)
+            raise ValueError(f"No prediction files for run {run_id} in run description XML")
 
     tags = openml.utils.extract_xml_tags("oml:tag", run)
 
@@ -1037,7 +1037,7 @@ def list_runs(  # noqa: PLR0913
     uploader: list | None = None,
     tag: str | None = None,
     study: int | None = None,
-    display_errors: bool = False,  # noqa: FBT001, FBT002
+    display_errors: bool = False,  # noqa: FBT002
     task_type: TaskType | int | None = None,
 ) -> pd.DataFrame:
     """
@@ -1171,7 +1171,7 @@ def _list_runs(  # noqa: PLR0913, C901
     if uploader is not None:
         api_call += f"/uploader/{','.join([str(int(i)) for i in uploader])}"
     if study is not None:
-        api_call += "/study/%d" % study
+        api_call += f"/study/{study}"
     if display_errors:
         api_call += "/show_errors/true"
     if tag is not None:
diff --git a/openml/runs/run.py b/openml/runs/run.py
index 945264131..6bc51b570 100644
--- a/openml/runs/run.py
+++ b/openml/runs/run.py
@@ -280,7 +280,7 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
         ]
 
     @classmethod
-    def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> OpenMLRun:  # noqa: FBT001, FBT002
+    def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> OpenMLRun:  # noqa: FBT002
         """
         The inverse of the to_filesystem method. Instantiates an OpenMLRun
         object based on files stored on the file system.
@@ -347,7 +347,7 @@ def from_filesystem(cls, directory: str | Path, expect_model: bool = True) -> Op
     def to_filesystem(
         self,
         directory: str | Path,
-        store_model: bool = True,  # noqa: FBT001, FBT002
+        store_model: bool = True,  # noqa: FBT002
     ) -> None:
         """
         The inverse of the from_filesystem method. Serializes a run
@@ -365,7 +365,7 @@ def to_filesystem(
             model.
         """
         if self.data_content is None or self.model is None:
-            raise ValueError("Run should have been executed (and contain " "model / predictions)")
+            raise ValueError("Run should have been executed (and contain model / predictions)")
 
         directory = Path(directory)
         directory.mkdir(exist_ok=True, parents=True)
@@ -517,7 +517,7 @@ def get_metric_fn(self, sklearn_fn: Callable, kwargs: dict | None = None) -> np.
             # TODO: make this a stream reader
         else:
             raise ValueError(
-                "Run should have been locally executed or " "contain outputfile reference.",
+                "Run should have been locally executed or contain outputfile reference.",
             )
 
         # Need to know more about the task to compute scores correctly
@@ -528,11 +528,11 @@
             task.task_type_id in [TaskType.SUPERVISED_CLASSIFICATION, TaskType.LEARNING_CURVE]
             and "correct" not in attribute_names
         ):
-            raise ValueError('Attribute "correct" should be set for ' "classification task runs")
+            raise ValueError('Attribute "correct" should be set for classification task runs')
         if task.task_type_id == TaskType.SUPERVISED_REGRESSION and "truth" not in attribute_names:
-            raise ValueError('Attribute "truth" should be set for ' "regression task runs")
+            raise ValueError('Attribute "truth" should be set for regression task runs')
         if task.task_type_id != TaskType.CLUSTERING and "prediction" not in attribute_names:
-            raise ValueError('Attribute "predict" should be set for ' "supervised task runs")
+            raise ValueError('Attribute "prediction" should be set for supervised task runs')
 
         def _attribute_list_to_dict(attribute_list):  # type: ignore
             # convenience function: Creates a mapping to map from the name of
@@ -566,7 +566,7 @@ def _attribute_list_to_dict(attribute_list):  # type: ignore
             pred = predictions_arff["attributes"][predicted_idx][1]
             corr = predictions_arff["attributes"][correct_idx][1]
             raise ValueError(
-                "Predicted and Correct do not have equal values:" f" {pred!s} Vs. {corr!s}",
+                f"Predicted and Correct do not have equal values: {pred!s} Vs. {corr!s}",
             )
 
         # TODO: these could be cached
@@ -602,7 +602,7 @@ def _attribute_list_to_dict(attribute_list):  # type: ignore
                     values_correct[rep][fold][samp].append(correct)
 
         scores = []
-        for rep in values_predict:
+        for rep in values_predict:  # noqa: PLC0206
             for fold in values_predict[rep]:
                 last_sample = len(values_predict[rep][fold]) - 1
                 y_pred = values_predict[rep][fold][last_sample]
diff --git a/openml/runs/trace.py b/openml/runs/trace.py
index bc9e1b5d6..36247ec01 100644
--- a/openml/runs/trace.py
+++ b/openml/runs/trace.py
@@ -149,9 +149,7 @@ def get_selected_iteration(self, fold: int, repeat: int) -> int:
         for r, f, i in self.trace_iterations:
             if r == repeat and f == fold and self.trace_iterations[(r, f, i)].selected is True:
                 return i
-        raise ValueError(
-            "Could not find the selected iteration for rep/fold %d/%d" % (repeat, fold),
-        )
+        raise ValueError(f"Could not find the selected iteration for rep/fold {repeat}/{fold}")
 
     @classmethod
     def generate(
@@ -185,8 +183,7 @@ def generate(
             raise ValueError("Trace content is empty.")
         if len(attributes) != len(content[0]):
             raise ValueError(
-                "Trace_attributes and trace_content not compatible:"
-                f" {attributes} vs {content[0]}",
+                f"Trace_attributes and trace_content not compatible: {attributes} vs {content[0]}",
             )
 
         return cls._trace_from_arff_struct(
diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py
index dd38cb9b7..fa4072059 100644
--- a/openml/setups/__init__.py
+++ b/openml/setups/__init__.py
@@ -4,10 +4,10 @@
 from .setup import OpenMLParameter, OpenMLSetup
 
 __all__ = [
-    "OpenMLSetup",
     "OpenMLParameter",
+    "OpenMLSetup",
     "get_setup",
+    "initialize_model",
     "list_setups",
     "setup_exists",
-    "initialize_model",
 ]
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
index 374911901..fe72d0160 100644
--- a/openml/setups/functions.py
+++ b/openml/setups/functions.py
@@ -94,7 +94,7 @@ def _get_cached_setup(setup_id: int) -> OpenMLSetup:
 
     except OSError as e:
         raise openml.exceptions.OpenMLCacheException(
-            "Setup file for setup id %d not cached" % setup_id,
+            f"Setup file for setup id {setup_id} not cached",
         ) from e
 
 
diff --git a/openml/study/__init__.py b/openml/study/__init__.py
index b7d77fec4..37a6d376a 100644
--- a/openml/study/__init__.py
+++ b/openml/study/__init__.py
@@ -19,8 +19,8 @@
 from .study import OpenMLBenchmarkSuite, OpenMLStudy
 
 __all__ = [
-    "OpenMLStudy",
     "OpenMLBenchmarkSuite",
+    "OpenMLStudy",
     "attach_to_study",
     "attach_to_suite",
     "create_benchmark_suite",
@@ -33,6 +33,6 @@
     "get_suite",
     "list_studies",
     "list_suites",
-    "update_suite_status",
     "update_study_status",
+    "update_suite_status",
 ]
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 4e16879d7..bb24ddcff 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -1,5 +1,4 @@
 # License: BSD 3-Clause
-# ruff: noqa: PLR0913
 from __future__ import annotations
 
 import warnings
@@ -422,7 +421,7 @@ def detach_from_study(study_id: int, run_ids: list[int]) -> int:
         new size of the study (in terms of explicitly linked entities)
     """
     # Interestingly, there's no need to tell the server about the entity type, it knows by itself
-    uri = "study/%d/detach" % study_id
+    uri = f"study/{study_id}/detach"
     post_variables = {"ids": ",".join(str(x) for x in run_ids)}  # type: openml._api_calls.DATA_TYPE
     result_xml = openml._api_calls._perform_api_call(
         call=uri,
diff --git a/openml/tasks/__init__.py b/openml/tasks/__init__.py
index f6df3a8d4..34c994e3a 100644
--- a/openml/tasks/__init__.py
+++ b/openml/tasks/__init__.py
@@ -19,17 +19,17 @@
 )
 
 __all__ = [
-    "OpenMLTask",
-    "OpenMLSupervisedTask",
-    "OpenMLClusteringTask",
-    "OpenMLRegressionTask",
     "OpenMLClassificationTask",
+    "OpenMLClusteringTask",
     "OpenMLLearningCurveTask",
+    "OpenMLRegressionTask",
+    "OpenMLSplit",
+    "OpenMLSupervisedTask",
+    "OpenMLTask",
+    "TaskType",
     "create_task",
+    "delete_task",
     "get_task",
     "get_tasks",
     "list_tasks",
-    "OpenMLSplit",
-    "TaskType",
-    "delete_task",
 ]
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
index d2bf5e946..fc046863a 100644
--- a/openml/tasks/functions.py
+++ b/openml/tasks/functions.py
@@ -38,7 +38,7 @@ def _get_cached_tasks() -> dict[int, OpenMLTask]:
         OpenMLTask.
     """
     task_cache_dir = openml.utils._create_cache_directory(TASKS_CACHE_DIR_NAME)
-    directory_content = os.listdir(task_cache_dir)
+    directory_content = os.listdir(task_cache_dir)  # noqa: PTH208
     directory_content.sort()
 
     # Find all dataset ids for which we have downloaded the dataset
@@ -329,7 +329,7 @@ def __list_tasks(api_call: str) -> pd.DataFrame:  # noqa: C901, PLR0912
         except KeyError as e:
             if tid is not None:
                 warnings.warn(
-                    "Invalid xml for task %d: %s\nFrom %s" % (tid, e, task_),
+                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
                     RuntimeWarning,
                     stacklevel=2,
                 )
@@ -388,7 +388,7 @@ def get_tasks(
 @openml.utils.thread_safe_if_oslo_installed
 def get_task(
     task_id: int,
-    download_splits: bool = False,  # noqa: FBT001, FBT002
+    download_splits: bool = False,  # noqa: FBT002
     **get_dataset_kwargs: Any,
 ) -> OpenMLTask:
     """Download OpenML task for a given task ID.
@@ -442,7 +442,7 @@ def _get_task_description(task_id: int) -> OpenMLTask:
     except OpenMLCacheException:
         _cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
         xml_file = _cache_dir / "task.xml"
-        task_xml = openml._api_calls._perform_api_call("task/%d" % task_id, "get")
+        task_xml = openml._api_calls._perform_api_call(f"task/{task_id}", "get")
 
         with xml_file.open("w", encoding="utf8") as fh:
             fh.write(task_xml)
diff --git a/openml/tasks/split.py b/openml/tasks/split.py
index 4e781df35..464e41b2a 100644
--- a/openml/tasks/split.py
+++ b/openml/tasks/split.py
@@ -18,7 +18,7 @@ class Split(NamedTuple):
     test: np.ndarray
 
 
-class OpenMLSplit:
+class OpenMLSplit:  # noqa: PLW1641
     """OpenML Split object.
 
     This class manages train-test splits for a dataset across multiple
diff --git a/openml/testing.py b/openml/testing.py
index d1da16876..4f0b425a7 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -80,7 +80,7 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         for _ in range(n_levels):
             static_cache_dir = static_cache_dir.parent.absolute()
 
-        content = os.listdir(static_cache_dir)
+        content = os.listdir(static_cache_dir)  # noqa: PTH208
         if "files" in content:
             static_cache_dir = static_cache_dir / "files"
         else:
@@ -352,9 +352,9 @@ def create_request_response(
 
 
 __all__ = [
-    "TestBase",
-    "SimpleImputer",
     "CustomImputer",
+    "SimpleImputer",
+    "TestBase",
     "check_task_existence",
     "create_request_response",
 ]
diff --git a/openml/utils.py b/openml/utils.py
index 7e72e7aee..ce1584df2 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -103,7 +103,7 @@ def _get_rest_api_type_alias(oml_object: OpenMLBase) -> str:
     return api_type_alias
 
 
-def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None:  # noqa: FBT001, FBT002
+def _tag_openml_base(oml_object: OpenMLBase, tag: str, untag: bool = False) -> None:  # noqa: FBT002
     api_type_alias = _get_rest_api_type_alias(oml_object)
     if oml_object.id is None:
         raise openml.exceptions.ObjectNotPublishedError(
@@ -198,7 +198,7 @@ def _delete_entity(entity_type: str, entity_id: int) -> bool:
     if entity_type not in legal_entities:
         raise ValueError(f"Can't delete a {entity_type}")
 
-    url_suffix = "%s/%d" % (entity_type, entity_id)
+    url_suffix = f"{entity_type}/{entity_id}"
     try:
         result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
         result = xmltodict.parse(result_xml)
@@ -344,7 +344,7 @@ def _create_cache_directory(key: str) -> Path:
     return cache_dir
 
 
-def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path:  # noqa: FBT001, FBT002
+def _get_cache_dir_for_id(key: str, id_: int, create: bool = False) -> Path:  # noqa: FBT002
     cache_dir = _create_cache_directory(key) if create else _get_cache_dir_for_key(key)
     return Path(cache_dir) / str(id_)
 
diff --git a/pyproject.toml b/pyproject.toml
index 2bf762b09..3483c860a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -275,9 +275,11 @@ ignore = [
   "S101",  # Use of assert detected.
"W292", # No newline at end of file "PLC1901", # "" can be simplified to be falsey - "TCH003", # Move stdlib import into TYPE_CHECKING + "TC003", # Move stdlib import into TYPE_CHECKING "COM812", # Trailing comma missing (handled by linter, ruff recommend disabling if using formatter) "N803", # Argument should be lowercase (but we accept things like `X`) + "PLC0415", # Allow imports inside functions / non-top-level scope + "FBT001", # Allow Boolean-typed positional argument in function definition # TODO(@eddibergman): These should be enabled "D100", # Missing docstring in public module diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 000000000..000969b80 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"""Package for scripts and utilities."""