diff --git a/examples/Advanced/huggingface_integration_tutorial.py b/examples/Advanced/huggingface_integration_tutorial.py
new file mode 100644
index 000000000..8ae801c5e
--- /dev/null
+++ b/examples/Advanced/huggingface_integration_tutorial.py
@@ -0,0 +1,109 @@
+"""
+HuggingFace Hub Integration Tutorial
+=====================================
+
+This example demonstrates how to share OpenML flows with HuggingFace Hub,
+enabling bidirectional model sharing between the two platforms.
+
+Prerequisites:
+- huggingface_hub installed: pip install huggingface_hub
+- HuggingFace account with API token
+"""
+
+import openml
+from openml.extensions.huggingface import (
+    download_flow_from_huggingface,
+    upload_flow_to_huggingface,
+)
+
+# %%
+# Setup
+# -----
+# Configure OpenML (you need an API key from openml.org)
+openml.config.apikey = "YOUR_OPENML_API_KEY"
+
+# Your HuggingFace token (get from huggingface.co/settings/tokens)
+HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN"
+
+# %%
+# Example 1: Upload an OpenML Flow to HuggingFace
+# ------------------------------------------------
+
+# Get a flow from OpenML (this example uses a RandomForest classifier)
+flow_id = 8365  # sklearn RandomForestClassifier
+flow = openml.flows.get_flow(flow_id, reinstantiate=True)
+
+print(f"Flow Name: {flow.name}")
+print(f"Flow ID: {flow.flow_id}")
+
+# Upload to HuggingFace Hub
+hf_url = upload_flow_to_huggingface(
+    flow=flow,
+    repo_id="your-username/openml-randomforest",  # Change to your username
+    token=HF_TOKEN,
+    private=False,  # Set to True for private repositories
+)
+
+print(f"Model uploaded to: {hf_url}")
+
+# %%
+# Example 2: Download a Model from HuggingFace
+# ---------------------------------------------
+
+result = download_flow_from_huggingface(
+    repo_id="your-username/openml-randomforest",
+    token=HF_TOKEN,  # Only needed for private repos
+)
+
+# Access the model
+model = result["model"]
+metadata = result["metadata"]
+
+print(f"Downloaded model: {type(model)}")
+print(f"Original OpenML Flow ID: {metadata['openml_flow_id']}")
+print(f"OpenML URL: {metadata['openml_url']}")
+
+# %%
+# Example 3: Share Your Own Model
+# --------------------------------
+# Train a model, create a flow, publish to OpenML, then share on HuggingFace
+
+from sklearn.ensemble import RandomForestClassifier
+
+# Get a dataset
+dataset = openml.datasets.get_dataset(31)  # credit-g dataset
+X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
+
+# Train a model
+clf = RandomForestClassifier(n_estimators=10, random_state=42)
+clf.fit(X, y)
+
+# Create and publish flow (the extension matching the model does the conversion)
+extension = openml.extensions.get_extension_by_model(clf, raise_if_no_extension=True)
+flow = extension.model_to_flow(clf)
+flow.publish()
+
+print(f"Published flow with ID: {flow.flow_id}")
+
+# Share on HuggingFace
+hf_url = upload_flow_to_huggingface(
+    flow=flow,
+    repo_id="your-username/my-credit-model",
+    token=HF_TOKEN,
+    commit_message="Initial upload of credit scoring model",
+)
+
+print(f"Shared on HuggingFace: {hf_url}")
+
+# %%
+# Example 4: Using Configuration
+# -------------------------------
+from openml.extensions.huggingface.config import get_config, set_cache_directory
+
+# Set custom cache directory
+set_cache_directory("/path/to/custom/cache")
+
+# Check configuration
+config = get_config()
+print(f"Cache directory: {config.cache_dir}")
+print(f"Model filename: {config.model_filename}")
diff --git a/openml/extensions/huggingface/__init__.py b/openml/extensions/huggingface/__init__.py
new file mode 100644
index 000000000..9424e249c
--- /dev/null
+++ b/openml/extensions/huggingface/__init__.py
@@ -0,0 +1,14 @@
+"""
+HuggingFace Hub integration for OpenML.
+Enables bidirectional model sharing between OpenML and HuggingFace Hub.
+"""
+
+from .functions import (
+    download_flow_from_huggingface,
+    upload_flow_to_huggingface,
+)
+
+__all__ = [
+    "download_flow_from_huggingface",
+    "upload_flow_to_huggingface",
+]
diff --git a/openml/extensions/huggingface/config.py b/openml/extensions/huggingface/config.py
new file mode 100644
index 000000000..35defa903
--- /dev/null
+++ b/openml/extensions/huggingface/config.py
@@ -0,0 +1,82 @@
+"""Configuration for HuggingFace Hub integration."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from pathlib import Path
+
+import openml
+
+
+@dataclass
+class HuggingFaceConfig:
+    """Configuration for HuggingFace Hub integration.
+
+    Attributes
+    ----------
+    cache_dir : Path
+        Directory to cache downloaded models from HuggingFace.
+    default_commit_message : str
+        Default commit message when uploading to HuggingFace.
+    model_filename : str
+        Filename for serialized model in HuggingFace repos.
+    metadata_filename : str
+        Filename for OpenML metadata in HuggingFace repos.
+    readme_filename : str
+        Filename for the model card in HuggingFace repos.
+    """
+
+    cache_dir: Path | None = None
+    default_commit_message: str = "Upload from OpenML"
+    model_filename: str = "model.pkl"
+    metadata_filename: str = "openml_metadata.json"
+    readme_filename: str = "README.md"
+
+    def __post_init__(self) -> None:
+        """Initialize cache directory."""
+        if self.cache_dir is None:
+            # Use OpenML cache directory + huggingface subdirectory
+            self.cache_dir = Path(openml.config.get_cache_directory()) / "huggingface"
+
+        # Ensure cache directory exists
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+
+# Global configuration instance
+_config = HuggingFaceConfig()
+
+
+def get_config() -> HuggingFaceConfig:
+    """Get the current HuggingFace integration configuration.
+
+    Returns
+    -------
+    HuggingFaceConfig
+        Current configuration object.
+    """
+    return _config
+
+
+def set_cache_directory(path: str | Path) -> None:
+    """Set the cache directory for HuggingFace downloads.
+
+    Parameters
+    ----------
+    path : str or Path
+        Path to cache directory.
+    """
+    _config.cache_dir = Path(path)
+    _config.cache_dir.mkdir(parents=True, exist_ok=True)
+
+
+def reset_config() -> None:
+    """Reset configuration to defaults.
+
+    The global instance is updated in place (rather than replaced) so that
+    references previously obtained through ``get_config`` stay valid.
+    """
+    # Derive the defaults from the dataclass itself instead of repeating them here,
+    # so a newly added field can never be forgotten.
+    defaults = HuggingFaceConfig()
+    for field in fields(HuggingFaceConfig):
+        setattr(_config, field.name, getattr(defaults, field.name))
diff --git a/openml/extensions/huggingface/functions.py b/openml/extensions/huggingface/functions.py
new file mode 100644
index 000000000..021f85452
--- /dev/null
+++ b/openml/extensions/huggingface/functions.py
@@ -0,0 +1,368 @@
+"""Core functions for HuggingFace Hub integration."""
+
+from __future__ import annotations
+
+import json
+import pickle
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from openml.flows import OpenMLFlow
+
+try:
+    from huggingface_hub import HfApi, create_repo, hf_hub_download
+
+    HUGGINGFACE_AVAILABLE = True
+except ImportError:
+    HUGGINGFACE_AVAILABLE = False
+
+from openml.exceptions import PyOpenMLError
+
+from .config import get_config
+
+
+def _check_huggingface_available() -> None:
+    """Check if huggingface_hub is installed."""
+    if not HUGGINGFACE_AVAILABLE:
+        raise ImportError(
+            "HuggingFace Hub integration requires 'huggingface_hub'. "
+            "Install with: pip install huggingface_hub"
+        )
+
+
+def upload_flow_to_huggingface(
+    flow: OpenMLFlow,
+    repo_id: str,
+    token: str,
+    *,
+    private: bool = False,
+    commit_message: str | None = None,
+) -> str:
+    """Upload an OpenML flow to HuggingFace Hub.
+
+    This function creates a model repository on HuggingFace Hub and uploads:
+    1. The serialized model (pickle format)
+    2. OpenML flow metadata (JSON)
+    3. A model card with documentation
+
+    Parameters
+    ----------
+    flow : OpenMLFlow
+        OpenML flow to upload. Must have a valid flow_id (i.e., published to OpenML).
+    repo_id : str
+        Repository name in format 'username/repo-name' or 'organization/repo-name'.
+    token : str
+        HuggingFace API token with write access.
+    private : bool, default=False
+        Whether to create a private repository.
+    commit_message : str, optional
+        Custom commit message. If None, uses default from config.
+
+    Returns
+    -------
+    str
+        URL of the uploaded model on HuggingFace Hub.
+
+    Raises
+    ------
+    ImportError
+        If huggingface_hub is not installed.
+    PyOpenMLError
+        If the flow has no flow_id or model, or if creating the repository
+        or uploading a file fails.
+
+    Examples
+    --------
+    >>> import openml
+    >>> from openml.extensions.huggingface import upload_flow_to_huggingface
+    >>>
+    >>> # Get a flow from OpenML
+    >>> flow = openml.flows.get_flow(12345, reinstantiate=True)
+    >>>
+    >>> # Upload to HuggingFace
+    >>> url = upload_flow_to_huggingface(
+    ...     flow=flow,
+    ...     repo_id="my-username/my-sklearn-model",
+    ...     token="hf_xxxxx",
+    ...     private=False,
+    ... )
+    >>> print(f"Model uploaded to: {url}")
+    """
+    _check_huggingface_available()
+
+    config = get_config()
+
+    if flow.flow_id is None:
+        raise PyOpenMLError(
+            "Flow must be published to OpenML before uploading to HuggingFace. "
+            "Use flow.publish() first."
+        )
+
+    if flow.model is None:
+        raise PyOpenMLError(
+            "Flow must have a model instance. "
+            "Use openml.flows.get_flow(flow_id, reinstantiate=True)."
+        )
+
+    # Create repository
+    api = HfApi()
+    try:
+        create_repo(
+            repo_id=repo_id,
+            token=token,
+            private=private,
+            repo_type="model",
+            exist_ok=True,
+        )
+    except Exception as e:
+        raise PyOpenMLError(f"Failed to create HuggingFace repository: {e}") from e
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir_path = Path(tmpdir)
+
+        # 1. Save the model
+        model_path = tmpdir_path / config.model_filename
+        with model_path.open("wb") as f:
+            pickle.dump(flow.model, f)
+
+        # 2. Save OpenML metadata
+        metadata = {
+            "openml_flow_id": flow.flow_id,
+            "openml_flow_name": flow.name,
+            "openml_url": f"https://www.openml.org/f/{flow.flow_id}",
+            "flow_description": flow.description,
+            "dependencies": flow.dependencies,
+            "parameters": flow.parameters,
+            "external_version": flow.external_version,
+        }
+        metadata_path = tmpdir_path / config.metadata_filename
+        # Write text files as UTF-8 explicitly; the platform default encoding may differ.
+        with metadata_path.open("w", encoding="utf-8") as f:
+            json.dump(metadata, f, indent=2)
+
+        # 3. Create model card
+        model_card = _create_model_card(
+            flow,
+            repo_id=repo_id,
+            model_filename=config.model_filename,
+        )
+        card_path = tmpdir_path / config.readme_filename
+        with card_path.open("w", encoding="utf-8") as f:
+            f.write(model_card)
+
+        # Upload files
+        commit_msg = commit_message or config.default_commit_message
+
+        try:
+            for file_path in [model_path, metadata_path, card_path]:
+                api.upload_file(
+                    path_or_fileobj=str(file_path),
+                    path_in_repo=file_path.name,
+                    repo_id=repo_id,
+                    token=token,
+                    commit_message=commit_msg,
+                )
+        except Exception as e:
+            raise PyOpenMLError(f"Failed to upload files to HuggingFace: {e}") from e
+
+    return f"https://huggingface.co/{repo_id}"
+
+
+def download_flow_from_huggingface(
+    repo_id: str,
+    token: str | None = None,
+    local_dir: str | Path | None = None,
+) -> dict[str, Any]:
+    """Download a model and its OpenML metadata from HuggingFace Hub.
+
+    Parameters
+    ----------
+    repo_id : str
+        Repository name in format 'username/repo-name'.
+    token : str, optional
+        HuggingFace API token (required for private repos).
+    local_dir : str or Path, optional
+        Directory to save downloaded files. If None, uses cache directory from config.
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - 'model': The deserialized model object
+        - 'metadata': OpenML flow metadata (dict)
+        - 'model_path': Path to downloaded model file
+        - 'metadata_path': Path to metadata file
+
+    Raises
+    ------
+    ImportError
+        If huggingface_hub is not installed.
+    FileNotFoundError
+        If required files are not found in the repository.
+
+    Warnings
+    --------
+    The model file is deserialized with :mod:`pickle`, which can execute arbitrary
+    code. Only download from repositories you trust.
+
+    Examples
+    --------
+    >>> from openml.extensions.huggingface import download_flow_from_huggingface
+    >>>
+    >>> # Download model and metadata
+    >>> result = download_flow_from_huggingface("my-username/my-sklearn-model")
+    >>> model = result['model']
+    >>> metadata = result['metadata']
+    >>>
+    >>> print(f"Original OpenML Flow ID: {metadata['openml_flow_id']}")
+    """
+    _check_huggingface_available()
+
+    config = get_config()
+
+    if local_dir is None:
+        cache_dir = config.cache_dir
+        if cache_dir is None:
+            raise RuntimeError("Cache directory is not configured")
+        local_dir = cache_dir / repo_id.replace("/", "_")
+
+    local_dir = Path(local_dir)
+    local_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        # Download model
+        model_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=config.model_filename,
+            token=token,
+            local_dir=str(local_dir),
+        )
+
+        # Download metadata
+        metadata_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=config.metadata_filename,
+            token=token,
+            local_dir=str(local_dir),
+        )
+    except Exception as e:
+        raise FileNotFoundError(
+            f"Failed to download model from {repo_id}. "
+            f"Make sure the repository exists and contains the required files. "
+            f"Error: {e}"
+        ) from e
+
+    # Load model
+    # Note: pickle.load can be unsafe with untrusted data.
+    # Only use with models from trusted sources.
+    model_path_obj = Path(model_path)
+    with model_path_obj.open("rb") as f:
+        model = pickle.load(f)  # noqa: S301
+
+    # Load metadata
+    metadata_path_obj = Path(metadata_path)
+    with metadata_path_obj.open(encoding="utf-8") as f:
+        metadata = json.load(f)
+
+    return {
+        "model": model,
+        "metadata": metadata,
+        "model_path": model_path,
+        "metadata_path": metadata_path,
+    }
+
+
+def _create_model_card(
+    flow: OpenMLFlow,
+    repo_id: str = "REPO_ID",
+    model_filename: str = "model.pkl",
+) -> str:
+    """Create a HuggingFace model card for an OpenML flow.
+
+    Parameters
+    ----------
+    flow : OpenMLFlow
+        Flow whose name, id, description, dependencies and parameters are rendered.
+    repo_id : str, default="REPO_ID"
+        Repository id shown in the usage snippet of the card.
+    model_filename : str, default="model.pkl"
+        Name of the serialized model file shown in the usage snippet.
+
+    Returns
+    -------
+    str
+        Markdown text of the model card, including the YAML front matter.
+    """
+    card = f"""---
+tags:
+- openml
+- scikit-learn
+- machine-learning
+library_name: sklearn
+---
+
+# {flow.name}
+
+This model was uploaded from [OpenML](https://www.openml.org/f/{flow.flow_id}).
+
+## Model Description
+
+{flow.description or "No description provided."}
+
+## OpenML Information
+
+- **Flow ID**: {flow.flow_id}
+- **Flow Name**: {flow.name}
+- **External Version**: {flow.external_version}
+- **OpenML URL**: https://www.openml.org/f/{flow.flow_id}
+
+## Dependencies
+
+```
+{flow.dependencies or "No dependencies listed."}
+```
+
+## Parameters
+
+"""
+
+    if flow.parameters:
+        for param_name, param_value in flow.parameters.items():
+            card += f"- `{param_name}`: {param_value}\n"
+    else:
+        card += "No parameters defined.\n"
+
+    card += f"""
+## Usage
+
+```python
+import pickle
+from huggingface_hub import hf_hub_download
+
+# Download the model
+model_path = hf_hub_download(repo_id="{repo_id}", filename="{model_filename}")
+
+# Load the model
+with open(model_path, "rb") as f:
+    model = pickle.load(f)
+
+# Use the model
+# predictions = model.predict(X)
+```
+"""
+    # Plain (non-f) string on purpose: the BibTeX braces below must stay literal.
+    card += """
+## Citation
+
+```bibtex
+@misc{openml,
+  author = {OpenML},
+  title = {OpenML: Open Machine Learning},
+  year = {2023},
+  url = {https://www.openml.org}
+}
+```
+"""
+    return card
diff --git a/tests/test_extensions/test_huggingface.py b/tests/test_extensions/test_huggingface.py
new file mode 100644
index 000000000..d9020142e
--- /dev/null
+++ b/tests/test_extensions/test_huggingface.py
@@ -0,0 +1,175 @@
+"""Tests for HuggingFace integration."""
+import json
+import pickle
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from openml.exceptions import PyOpenMLError
+from openml.flows import OpenMLFlow
+
+# huggingface_hub is an optional dependency; skip this module when it is missing.
+pytest.importorskip("huggingface_hub")
+
+
+class TestHuggingFaceIntegration:
+    """Test suite for HuggingFace Hub integration."""
+
+    @pytest.fixture
+    def mock_flow(self):
+        """Create a mock OpenML flow for testing."""
+        flow = MagicMock(spec=OpenMLFlow)
+        flow.flow_id = 12345
+        flow.name = "sklearn.ensemble.RandomForestClassifier"
+        flow.description = "A random forest classifier"
+        flow.external_version = "1.0.0"
+        flow.dependencies = "scikit-learn==1.3.0"
+        flow.parameters = {"n_estimators": "100", "max_depth": "10"}
+        flow.model = MagicMock()  # Mock model object
+        return flow
+
+    def test_upload_flow_without_flow_id_raises_error(self, mock_flow):
+        """Test that uploading a flow without flow_id raises PyOpenMLError."""
+        from openml.extensions.huggingface import upload_flow_to_huggingface
+
+        mock_flow.flow_id = None
+
+        with pytest.raises(PyOpenMLError, match="must be published"):
+            upload_flow_to_huggingface(
+                flow=mock_flow,
+                repo_id="test/repo",
+                token="fake_token",
+            )
+
+    def test_upload_flow_without_model_raises_error(self, mock_flow):
+        """Test that uploading a flow without model raises PyOpenMLError."""
+        from openml.extensions.huggingface import upload_flow_to_huggingface
+
+        mock_flow.model = None
+
+        with pytest.raises(PyOpenMLError, match="must have a model"):
+            upload_flow_to_huggingface(
+                flow=mock_flow,
+                repo_id="test/repo",
+                token="fake_token",
+            )
+
+    @patch("openml.extensions.huggingface.functions.pickle.dump")
+    @patch("openml.extensions.huggingface.functions.create_repo")
+    @patch("openml.extensions.huggingface.functions.HfApi")
+    def test_upload_creates_correct_files(
+        self, mock_hf_api, mock_create_repo, mock_pickle_dump, mock_flow
+    ):
+        """Test that upload creates model.pkl, metadata.json, and README.md."""
+        from openml.extensions.huggingface import upload_flow_to_huggingface
+
+        mock_api_instance = MagicMock()
+        mock_hf_api.return_value = mock_api_instance
+
+        upload_flow_to_huggingface(
+            flow=mock_flow,
+            repo_id="test/repo",
+            token="fake_token",
+        )
+
+        # Verify create_repo was called
+        mock_create_repo.assert_called_once()
+
+        # Verify pickle.dump was called (for the model)
+        mock_pickle_dump.assert_called_once()
+
+        # Verify upload_file was called 3 times (model, metadata, README)
+        assert mock_api_instance.upload_file.call_count == 3
+
+        # Verify the files have correct names
+        call_args_list = mock_api_instance.upload_file.call_args_list
+        uploaded_files = [call.kwargs.get("path_in_repo") for call in call_args_list]
+
+        assert "model.pkl" in uploaded_files
+        assert "openml_metadata.json" in uploaded_files
+        assert "README.md" in uploaded_files
+
+    def test_model_card_generation(self, mock_flow):
+        """Test that model card is generated correctly."""
+        from openml.extensions.huggingface.functions import _create_model_card
+
+        card = _create_model_card(mock_flow, repo_id="test/repo")
+
+        assert "sklearn.ensemble.RandomForestClassifier" in card
+        assert "12345" in card
+        assert "n_estimators" in card
+        assert "https://www.openml.org/f/12345" in card
+        assert 'repo_id="test/repo"' in card
+
+    @patch("openml.extensions.huggingface.functions.hf_hub_download")
+    def test_download_flow_loads_correct_files(self, mock_download):
+        """Test that download correctly loads model and metadata."""
+        from openml.extensions.huggingface import download_flow_from_huggingface
+
+        # Create temporary files
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir_path = Path(tmpdir)
+
+            # Create mock model file
+            model_path = tmpdir_path / "model.pkl"
+            mock_model = {"type": "RandomForest"}
+            with model_path.open("wb") as f:
+                pickle.dump(mock_model, f)
+
+            # Create mock metadata file
+            metadata_path = tmpdir_path / "openml_metadata.json"
+            mock_metadata = {"openml_flow_id": 12345}
+            with metadata_path.open("w") as f:
+                json.dump(mock_metadata, f)
+
+            # Mock hf_hub_download to return our temp files
+            def side_effect(repo_id, filename, **kwargs):
+                if filename == "model.pkl":
+                    return str(model_path)
+                elif filename == "openml_metadata.json":
+                    return str(metadata_path)
+                return None
+
+            mock_download.side_effect = side_effect
+
+            # Test download; local_dir keeps the test from writing into the real cache
+            result = download_flow_from_huggingface(
+                "test/repo",
+                local_dir=tmpdir_path / "downloads",
+            )
+
+            assert result["model"] == mock_model
+            assert result["metadata"]["openml_flow_id"] == 12345
+
+    def test_config_initialization(self):
+        """Test that config initializes correctly."""
+        from openml.extensions.huggingface.config import get_config, reset_config
+
+        reset_config()
+        config = get_config()
+
+        assert config.model_filename == "model.pkl"
+        assert config.metadata_filename == "openml_metadata.json"
+        assert config.readme_filename == "README.md"
+        assert config.cache_dir is not None
+
+    def test_config_cache_directory_setting(self):
+        """Test setting custom cache directory."""
+        from openml.extensions.huggingface.config import (
+            get_config,
+            reset_config,
+            set_cache_directory,
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            try:
+                set_cache_directory(tmpdir)
+                config = get_config()
+
+                assert str(config.cache_dir) == tmpdir
+                assert config.cache_dir.exists()
+            finally:
+                # Restore the defaults so the global config never points at a deleted dir
+                reset_config()