From 1dcd35ef941fb6f76f0d82cf90330f445dd48531 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 14 Oct 2024 18:18:01 +0200
Subject: [PATCH 1/8] Specify full import
---
tests/routers/openml/migration/datasets_migration_test.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 3faca11..f09f86a 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -1,11 +1,11 @@
import json
from http import HTTPStatus
-import constants
import httpx
import pytest
from starlette.testclient import TestClient
+import tests.constants
from core.conversions import nested_remove_single_element_list
from tests.users import ApiKey
@@ -127,7 +127,7 @@ def test_private_dataset_owner_access(
php_api: TestClient,
api_key: str,
) -> None:
- [private_dataset] = constants.PRIVATE_DATASET_ID
+ [private_dataset] = tests.constants.PRIVATE_DATASET_ID
new_response = py_api.get(f"/datasets/{private_dataset}?api_key={api_key}")
old_response = php_api.get(f"/data/{private_dataset}?api_key={api_key}")
assert old_response.status_code == HTTPStatus.OK
From 48f8d8a74f4251c626eff8ce0e879cbd7787f662 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 14 Oct 2024 18:24:43 +0200
Subject: [PATCH 2/8] Check authentication is required for the dataset upload
---
tests/routers/openml/datasets_test.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 7c1457f..82c641a 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -269,3 +269,8 @@ def test_dataset_status_unauthorized(
json={"dataset_id": dataset_id, "status": status},
)
assert response.status_code == HTTPStatus.FORBIDDEN
+
+
+def test_dataset_upload_needs_authentication(py_api: TestClient) -> None:
+ response = py_api.post("/datasets")
+ assert response.status_code == HTTPStatus.UNAUTHORIZED
From aef0804ee35ecfb4d0125006908654ed29970e1c Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 14 Oct 2024 18:50:16 +0200
Subject: [PATCH 3/8] Require authentication for dataset upload
---
src/routers/openml/datasets.py | 13 +++++++++++++
tests/routers/openml/datasets_test.py | 8 ++++----
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index dda2511..48ad0a0 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -370,6 +370,19 @@ def update_dataset_status(
return {"dataset_id": dataset_id, "status": status}
+@router.post(
+ path="",
+)
+def upload_data(
+ user: Annotated[User | None, Depends(fetch_user)] = None,
+) -> None:
+ if user is None:
+ raise HTTPException(
+ status_code=HTTPStatus.UNAUTHORIZED,
+ detail="You need to authenticate to upload a dataset.",
+ )
+
+
@router.get(
path="/{dataset_id}",
description="Get meta-data for dataset with ID `dataset_id`.",
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 82c641a..3f71b62 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -6,7 +6,7 @@
from starlette.testclient import TestClient
from database.users import User
-from routers.openml.datasets import get_dataset
+from routers.openml.datasets import get_dataset, upload_data
from schemas.datasets.openml import DatasetMetadata, DatasetStatus
from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey
@@ -271,6 +271,6 @@ def test_dataset_status_unauthorized(
assert response.status_code == HTTPStatus.FORBIDDEN
-def test_dataset_upload_needs_authentication(py_api: TestClient) -> None:
- response = py_api.post("/datasets")
- assert response.status_code == HTTPStatus.UNAUTHORIZED
+def test_dataset_upload_needs_authentication() -> None:
+ with pytest.raises(HTTPException, match="You need to authenticate to upload a dataset."):
+ upload_data(user=None)
From b774c3373353a59ecc02ca6508c168d538505b84 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 16 Oct 2024 13:04:07 +0200
Subject: [PATCH 4/8] Add python-multipart for file streaming uploads
---
pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pyproject.toml b/pyproject.toml
index a7bff61..8960f1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
"mysqlclient",
"python_dotenv",
"xmltodict",
+ "python-multipart",
]
[project.optional-dependencies]
From cdd5066a3c89813ac0fce4909750f6574cdd7016 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 16 Oct 2024 17:27:13 +0200
Subject: [PATCH 5/8] Require a file to be present for upload
---
src/routers/openml/datasets.py | 15 +++++++++++----
tests/routers/openml/datasets_test.py | 24 +++++++++++++++++++++---
2 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 48ad0a0..bbae577 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -2,9 +2,10 @@
from datetime import datetime
from enum import StrEnum
from http import HTTPStatus
+from pathlib import Path
from typing import Annotated, Any, Literal, NamedTuple
-from fastapi import APIRouter, Body, Depends, HTTPException
+from fastapi import APIRouter, Body, Depends, File, HTTPException, UploadFile
from sqlalchemy import Connection, text
from sqlalchemy.engine import Row
@@ -370,10 +371,9 @@ def update_dataset_status(
return {"dataset_id": dataset_id, "status": status}
-@router.post(
- path="",
-)
+@router.post(path="")
def upload_data(
+ file: Annotated[UploadFile, File(description="A pyarrow parquet file containing the data.")],
user: Annotated[User | None, Depends(fetch_user)] = None,
) -> None:
if user is None:
@@ -381,6 +381,13 @@ def upload_data(
status_code=HTTPStatus.UNAUTHORIZED,
detail="You need to authenticate to upload a dataset.",
)
+ # Spooled file-- where is it stored?
+ if file.filename is None or Path(file.filename).suffix != ".pq":
+ raise HTTPException(
+ status_code=HTTPStatus.IM_A_TEAPOT,
+ detail="The uploaded file needs to be a parquet file (.pq).",
+ )
+ # use async interface
@router.get(
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 3f71b62..ef4338b 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -1,7 +1,8 @@
from http import HTTPStatus
+from io import BytesIO
import pytest
-from fastapi import HTTPException
+from fastapi import HTTPException, UploadFile
from sqlalchemy import Connection
from starlette.testclient import TestClient
@@ -272,5 +273,22 @@ def test_dataset_status_unauthorized(
def test_dataset_upload_needs_authentication() -> None:
- with pytest.raises(HTTPException, match="You need to authenticate to upload a dataset."):
- upload_data(user=None)
+ with pytest.raises(HTTPException) as e:
+ upload_data(user=None, file=None) # type: ignore[arg-type]
+
+ assert e.value.status_code == HTTPStatus.UNAUTHORIZED
+ assert e.value.detail == "You need to authenticate to upload a dataset."
+
+
+@pytest.mark.parametrize(
+ "file_name", ["parquet.csv", pytest.param("parquet.pq", marks=pytest.mark.xfail)]
+)
+def test_dataset_upload_error_if_not_parquet(file_name: str) -> None:
+ # we do not expect the server to actually check the parquet content
+ file = UploadFile(filename=file_name, file=BytesIO(b""))
+
+ with pytest.raises(HTTPException) as e:
+ upload_data(file, SOME_USER)
+
+ assert e.value.status_code == HTTPStatus.IM_A_TEAPOT
+ assert e.value.detail == "The uploaded file needs to be a parquet file (.pq)."
From 13f4a0207a0c084b1b3aa0ba8857f6dfcaa7a4c2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 16 Oct 2024 17:42:44 +0200
Subject: [PATCH 6/8] Separate server-provided data from user-provided metadata
---
src/routers/openml/datasets.py | 13 ++++++++++---
src/schemas/datasets/convertor.py | 4 ++--
src/schemas/datasets/openml.py | 10 ++++++----
tests/routers/openml/datasets_test.py | 12 ++++++++----
4 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index bbae577..9c18e95 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -22,7 +22,13 @@
from database.users import User, UserGroup
from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection
from routers.types import CasualString128, IntegerRange, SystemString64, integer_range_regex
-from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Feature, FeatureType
+from schemas.datasets.openml import (
+ DatasetMetadata,
+ DatasetMetadataView,
+ DatasetStatus,
+ Feature,
+ FeatureType,
+)
router = APIRouter(prefix="/datasets", tags=["datasets"])
@@ -374,6 +380,7 @@ def update_dataset_status(
@router.post(path="")
def upload_data(
file: Annotated[UploadFile, File(description="A pyarrow parquet file containing the data.")],
+ metadata: DatasetMetadata, # noqa: ARG001
user: Annotated[User | None, Depends(fetch_user)] = None,
) -> None:
if user is None:
@@ -399,7 +406,7 @@ def get_dataset(
user: Annotated[User | None, Depends(fetch_user)] = None,
user_db: Annotated[Connection, Depends(userdb_connection)] = None,
expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
-) -> DatasetMetadata:
+) -> DatasetMetadataView:
dataset = _get_dataset_raise_otherwise(dataset_id, user, expdb_db)
if not (
dataset_file := database.datasets.get_file(file_id=dataset.file_id, connection=user_db)
@@ -431,7 +438,7 @@ def get_dataset(
original_data_url = _csv_as_list(dataset.original_data_url, unquote_items=True)
default_target_attribute = _csv_as_list(dataset.default_target_attribute, unquote_items=True)
- return DatasetMetadata(
+ return DatasetMetadataView(
id=dataset.did,
visibility=dataset.visibility,
status=status_,
diff --git a/src/schemas/datasets/convertor.py b/src/schemas/datasets/convertor.py
index 3c35cd2..7b4a305 100644
--- a/src/schemas/datasets/convertor.py
+++ b/src/schemas/datasets/convertor.py
@@ -6,10 +6,10 @@
SpdxChecksum,
XSDDateTime,
)
-from schemas.datasets.openml import DatasetMetadata
+from schemas.datasets.openml import DatasetMetadataView
-def openml_dataset_to_dcat(metadata: DatasetMetadata) -> DcatApWrapper:
+def openml_dataset_to_dcat(metadata: DatasetMetadataView) -> DcatApWrapper:
checksum = SpdxChecksum(
id_=metadata.md5_checksum,
algorithm="md5",
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index 8edb373..b22bdec 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -59,10 +59,6 @@ class EstimationProcedure(BaseModel):
class DatasetMetadata(BaseModel):
- id_: int = Field(json_schema_extra={"example": 1}, alias="id")
- visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
- status: DatasetStatus = Field(json_schema_extra={"example": DatasetStatus.ACTIVE})
-
name: str = Field(json_schema_extra={"example": "Anneal"})
licence: str = Field(json_schema_extra={"example": "CC0"})
version: int = Field(json_schema_extra={"example": 2})
@@ -135,6 +131,12 @@ class DatasetMetadata(BaseModel):
md5_checksum: str = Field(json_schema_extra={"example": "d01f6ccd68c88b749b20bbe897de3713"})
+class DatasetMetadataView(DatasetMetadata):
+ id_: int = Field(json_schema_extra={"example": 1}, alias="id")
+ visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
+ status: DatasetStatus = Field(json_schema_extra={"example": DatasetStatus.ACTIVE})
+
+
class Task(BaseModel):
id_: int = Field(serialization_alias="id", json_schema_extra={"example": 59})
name: str = Field(
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index ef4338b..1c7d9a6 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -8,7 +8,7 @@
from database.users import User
from routers.openml.datasets import get_dataset, upload_data
-from schemas.datasets.openml import DatasetMetadata, DatasetStatus
+from schemas.datasets.openml import DatasetMetadataView, DatasetStatus
from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey
@@ -101,7 +101,7 @@ def test_private_dataset_access(user: User, expdb_test: Connection, user_test: C
user_db=user_test,
expdb_db=expdb_test,
)
- assert isinstance(dataset, DatasetMetadata)
+ assert isinstance(dataset, DatasetMetadataView)
def test_dataset_features(py_api: TestClient) -> None:
@@ -274,7 +274,7 @@ def test_dataset_status_unauthorized(
def test_dataset_upload_needs_authentication() -> None:
with pytest.raises(HTTPException) as e:
- upload_data(user=None, file=None) # type: ignore[arg-type]
+ upload_data(user=None, file=None, metadata=None) # type: ignore[arg-type]
assert e.value.status_code == HTTPStatus.UNAUTHORIZED
assert e.value.detail == "You need to authenticate to upload a dataset."
@@ -288,7 +288,11 @@ def test_dataset_upload_error_if_not_parquet(file_name: str) -> None:
file = UploadFile(filename=file_name, file=BytesIO(b""))
with pytest.raises(HTTPException) as e:
- upload_data(file, SOME_USER)
+ upload_data(file=file, user=SOME_USER, metadata=None) # type: ignore[arg-type]
assert e.value.status_code == HTTPStatus.IM_A_TEAPOT
assert e.value.detail == "The uploaded file needs to be a parquet file (.pq)."
+
+
+def test_dataset_upload() -> None:
+ pass
From 1e09958d44784cf402f4ae6965e9be828c171706 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 16 Oct 2024 17:43:32 +0200
Subject: [PATCH 7/8] Change the failure code to BAD_REQUEST
---
src/routers/openml/datasets.py | 2 +-
tests/routers/openml/datasets_test.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 9c18e95..0b15475 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -391,7 +391,7 @@ def upload_data(
# Spooled file-- where is it stored?
if file.filename is None or Path(file.filename).suffix != ".pq":
raise HTTPException(
- status_code=HTTPStatus.IM_A_TEAPOT,
+ status_code=HTTPStatus.BAD_REQUEST,
detail="The uploaded file needs to be a parquet file (.pq).",
)
# use async interface
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 1c7d9a6..2bb2ede 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -290,7 +290,7 @@ def test_dataset_upload_error_if_not_parquet(file_name: str) -> None:
with pytest.raises(HTTPException) as e:
upload_data(file=file, user=SOME_USER, metadata=None) # type: ignore[arg-type]
- assert e.value.status_code == HTTPStatus.IM_A_TEAPOT
+ assert e.value.status_code == HTTPStatus.BAD_REQUEST
assert e.value.detail == "The uploaded file needs to be a parquet file (.pq)."
From de22e0f284ed6154045ce7ceaf80169a67d02238 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 16 Oct 2024 18:09:28 +0200
Subject: [PATCH 8/8] Mark more metadata fields as server-defined
Some of these fields are not strictly server-defined; they simply live
outside the dataset table and are not expected to be provided on upload.
---
src/schemas/datasets/openml.py | 60 ++++++++++++++++++----------------
1 file changed, 32 insertions(+), 28 deletions(-)
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index b22bdec..eee3e84 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -62,13 +62,17 @@ class DatasetMetadata(BaseModel):
name: str = Field(json_schema_extra={"example": "Anneal"})
licence: str = Field(json_schema_extra={"example": "CC0"})
version: int = Field(json_schema_extra={"example": 2})
- version_label: str = Field(
+ version_label: str | None = Field(
json_schema_extra={
"example": 2,
"description": "Not sure how this relates to `version`.",
},
+ max_length=128,
+ )
+ language: str | None = Field(
+ json_schema_extra={"example": "English"},
+ max_length=128,
)
- language: str = Field(json_schema_extra={"example": "English"})
creators: list[str] = Field(
json_schema_extra={"example": ["David Sterling", "Wray Buntine"]},
@@ -78,7 +82,7 @@ class DatasetMetadata(BaseModel):
json_schema_extra={"example": ["David Sterling", "Wray Buntine"]},
alias="contributor",
)
- citation: str = Field(
+ citation: str | None = Field(
json_schema_extra={"example": "https://archive.ics.uci.edu/ml/citation_policy.html"},
)
paper_url: HttpUrl | None = Field(
@@ -86,6 +90,30 @@ class DatasetMetadata(BaseModel):
"example": "http://digital.library.adelaide.edu.au/dspace/handle/2440/15227",
},
)
+ collection_date: str | None = Field(json_schema_extra={"example": "1990"})
+
+ description: str = Field(
+ json_schema_extra={"example": "The original Annealing dataset from UCI."},
+ )
+ default_target_attribute: list[str] = Field(json_schema_extra={"example": "class"})
+ ignore_attribute: list[str] = Field(json_schema_extra={"example": "sensitive_feature"})
+ row_id_attribute: list[str] = Field(json_schema_extra={"example": "ssn"})
+
+ format_: DatasetFileFormat = Field(
+ json_schema_extra={"example": DatasetFileFormat.ARFF},
+ alias="format",
+ )
+ original_data_url: list[HttpUrl] | None = Field(
+ json_schema_extra={"example": "https://www.openml.org/d/2"},
+ )
+
+
+class DatasetMetadataView(DatasetMetadata):
+ id_: int = Field(json_schema_extra={"example": 1}, alias="id")
+ visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
+ status: DatasetStatus = Field(json_schema_extra={"example": DatasetStatus.ACTIVE})
+ description_version: int = Field(json_schema_extra={"example": 2})
+ tags: list[str] = Field(json_schema_extra={"example": ["study_1", "uci"]}, alias="tag")
upload_date: datetime = Field(
json_schema_extra={"example": str(datetime(2014, 4, 6, 23, 12, 20))},
)
@@ -97,17 +125,7 @@ class DatasetMetadata(BaseModel):
alias="error",
)
processing_warning: str | None = Field(alias="warning")
- collection_date: str | None = Field(json_schema_extra={"example": "1990"})
-
- description: str = Field(
- json_schema_extra={"example": "The original Annealing dataset from UCI."},
- )
- description_version: int = Field(json_schema_extra={"example": 2})
- tags: list[str] = Field(json_schema_extra={"example": ["study_1", "uci"]}, alias="tag")
- default_target_attribute: list[str] = Field(json_schema_extra={"example": "class"})
- ignore_attribute: list[str] = Field(json_schema_extra={"example": "sensitive_feature"})
- row_id_attribute: list[str] = Field(json_schema_extra={"example": "ssn"})
-
+ file_id: int = Field(json_schema_extra={"example": 1})
url: HttpUrl = Field(
json_schema_extra={
"example": "https://www.openml.org/data/download/1/dataset_1_anneal.arff",
@@ -120,23 +138,9 @@ class DatasetMetadata(BaseModel):
"description": "URL of the parquet dataset data file.",
},
)
- file_id: int = Field(json_schema_extra={"example": 1})
- format_: DatasetFileFormat = Field(
- json_schema_extra={"example": DatasetFileFormat.ARFF},
- alias="format",
- )
- original_data_url: list[HttpUrl] | None = Field(
- json_schema_extra={"example": "https://www.openml.org/d/2"},
- )
md5_checksum: str = Field(json_schema_extra={"example": "d01f6ccd68c88b749b20bbe897de3713"})
-class DatasetMetadataView(DatasetMetadata):
- id_: int = Field(json_schema_extra={"example": 1}, alias="id")
- visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
- status: DatasetStatus = Field(json_schema_extra={"example": DatasetStatus.ACTIVE})
-
-
class Task(BaseModel):
id_: int = Field(serialization_alias="id", json_schema_extra={"example": 59})
name: str = Field(