diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..dfc2cc8
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,28 @@
+name: Build and push Docker image to DockerHub
+
+on:
+  push:
+    branches: [ "main" ]
+  workflow_dispatch: # allows manual trigger
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DBP_DOCKERHUB_CREDENTIAL_USERNAME }}
+          password: ${{ secrets.DBP_DOCKERHUB_CREDENTIAL_TOKEN_PUSHIMAGES }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          tags: dbpedia/databus-python-client:latest
diff --git a/README.md b/README.md
index 3782a1c..d7df5f5 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,77 @@
 # Databus Client Python
 
-## Install
+## Quickstart Example
+Commands to download the DBpedia Knowledge Graphs generated by Live Fusion.
+DBpedia Live Fusion publishes two kinds of KGs:
+
+1. Open Core Knowledge Graphs under the CC-BY-SA license: open with copyleft/share-alike, no registration needed.
+2. Industry Knowledge Graphs under the BUSL 1.1 license: unrestricted for research and experimentation, commercial license required for productive use, free registration needed.
+
+
+### Registration (Access Token)
+
+1. If you do not have a DBpedia account yet (Forum/Databus), please register at https://account.dbpedia.org
+2. Log in at https://account.dbpedia.org and create your token.
+3. Save the token to a file `vault-token.dat`.
+
+### Docker vs. Python
+The databus-python-client is available as a **Docker** image and as a **Python** package; both follow the same pattern.
+`$DOWNLOADTARGET` can be any Databus URI (including collections) or a SPARQL query, and several targets can be given at once. Details are documented below.
 
 ```bash
+# Docker
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOADTARGET --token vault-token.dat
+# Python
 python3 -m pip install databusclient
+databusclient download $DOWNLOADTARGET --token vault-token.dat
+```
+
+### Download Live Fusion KG Snapshot (BUSL 1.1, registration needed)
+TODO One slogan sentence. [More information](https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-snapshot)
+```bash
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-kg-snapshot --token vault-token.dat
+```
+
+### Download Enriched Knowledge Graphs (BUSL 1.1, registration needed)
+**DBpedia Wikipedia Extraction Enriched**
+TODO One slogan sentence and link
+Currently EN DBpedia only.
+
+```bash
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikipedia-kg-enriched-snapshot --token vault-token.dat
+```
+**DBpedia Wikidata Extraction Enriched**
+TODO One slogan sentence and link
+
+```bash
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/dbpedia-wikidata-kg-enriched-snapshot --token vault-token.dat
+```
+
+### Download DBpedia Wikipedia Knowledge Graphs (CC-BY-SA, no registration needed)
+TODO One slogan sentence and link
+
+```bash
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikipedia-kg-snapshot
+```
+### Download DBpedia Wikidata Knowledge Graphs (CC-BY-SA, no registration needed)
+TODO One slogan sentence and link
+
+```bash
+docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/dbpedia-wikidata-kg-snapshot
 ```
 
+## Docker Image Usage
+
+A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). See the [download section](#usage-of-docker-image) for details.
+
+
 ## CLI Usage
+
+**Installation**
+```bash
+python3 -m pip install databusclient
+```
+
+**Running**
 ```bash
 databusclient --help
 ```
 ```
@@ -26,47 +92,7 @@ Commands:
   download
 ```
 
-## Docker Image Usage
-A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). See [download section](#usage-of-docker-image) for details.
-
-### Deploy command
-```
-databusclient deploy --help
-```
-```
-Usage: databusclient deploy [OPTIONS] DISTRIBUTIONS...
-
-Arguments:
-  DISTRIBUTIONS...  distributions in the form of List[URL|CV|fileext|compression|sha256sum:contentlength] where URL is the
-                    download URL and CV the key=value pairs (_ separted)
-                    content variants of a distribution, fileExt and Compression can be set, if not they are inferred from the path  [required]
-
-Options:
-  --version-id TEXT   Target databus version/dataset identifier of the form  [required]
-  --title TEXT        Dataset title  [required]
-  --abstract TEXT     Dataset abstract max 200 chars  [required]
-  --description TEXT  Dataset description  [required]
-  --license TEXT      License (see dalicc.net)  [required]
-  --apikey TEXT       API key  [required]
-  --help              Show this message and exit.
-```
-Examples of using deploy command
-```
-databusclient deploy --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
-```
-
-```
-databusclient deploy --version-id https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...." --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
-```
-
-A few more notes for CLI usage:
-
-* The content variants can be left out ONLY IF there is just one distribution
-  * For complete inferred: Just use the URL with `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml`
-  * If other parameters are used, you need to leave them empty like `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml||yml|7a751b6dd5eb8d73d97793c3c564c71ab7b565fa4ba619e4a8fd05a6f80ff653:367116`
 
 ### Download command
 ```
@@ -132,6 +158,46 @@ databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-s
 databusclient download 'PREFIX dcat: <http://www.w3.org/ns/dcat#> SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql
 ```
+### Deploy command
+```
+databusclient deploy --help
+```
+```
+Usage: databusclient deploy [OPTIONS] DISTRIBUTIONS...
+
+Arguments:
+  DISTRIBUTIONS...  distributions in the form of List[URL|CV|fileext|compression|sha256sum:contentlength] where URL is the
+                    download URL and CV the key=value pairs (_ separated)
+                    content variants of a distribution; fileExt and Compression can be set, if not they are inferred from the path  [required]
+
+Options:
+  --version-id TEXT   Target databus version/dataset identifier of the form  [required]
+  --title TEXT        Dataset title  [required]
+  --abstract TEXT     Dataset abstract max 200 chars  [required]
+  --description TEXT  Dataset description  [required]
+  --license TEXT      License (see dalicc.net)  [required]
+  --apikey TEXT       API key  [required]
+  --help              Show this message and exit.
+```
+Examples of using the deploy command:
+```
+databusclient deploy --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
+```
+
+```
+databusclient deploy --version-id https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...." --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
+```
+
+A few more notes for CLI usage:
+
+* The content variants can be left out ONLY IF there is just one distribution
+  * For completely inferred metadata: just use the plain URL `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml`
+  * If other parameters are used, you need to leave them empty, like `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml||yml|7a751b6dd5eb8d73d97793c3c564c71ab7b565fa4ba619e4a8fd05a6f80ff653:367116`
+
+
+#### Authentication with vault
+For downloading files from the vault, you need to provide a vault token. See [getting-the-access-refresh-token](https://github.com/dbpedia/databus-vault-access?tab=readme-ov-file#step-1-getting-the-access-refresh-token) for details. You can come back here once you have a `vault-token.dat` file.
+To use it, just provide the path to the file with `--token /path/to/vault-token.dat`.
@@ -155,8 +221,100 @@ If using vault authentication, make sure the token file is available in the cont
 docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-snapshots/fusion/2025-08-23/fusion_props=all_subjectns=commons-wikimedia-org_vocab=all.ttl.gz --token vault-token.dat
 ```
 
-## Module Usage
+### Upload-and-deploy command
+```bash
+databusclient upload-and-deploy --help
+```
+```text
+Usage: databusclient upload-and-deploy [OPTIONS] [FILES]...
+
+  Upload files to Nextcloud and deploy to DBpedia Databus.
+
+Arguments:
+  FILES...  files in the form of List[path], where every path must exist locally, which will be uploaded and deployed
+
+Options:
+  --webdav-url TEXT   WebDAV URL (e.g.,
+                      https://cloud.example.com/remote.php/webdav)
+  --remote TEXT       rclone remote name (e.g., 'nextcloud')
+  --path TEXT         Remote path on Nextcloud (e.g., 'datasets/mydataset')
+  --no-upload         Skip file upload and use existing metadata
+  --metadata PATH     Path to metadata JSON file (required if --no-upload is
+                      used)
+  --version-id TEXT   Target databus version/dataset identifier of the form  [required]
+  --title TEXT        Dataset title  [required]
+  --abstract TEXT     Dataset abstract max 200 chars  [required]
+  --description TEXT  Dataset description  [required]
+  --license TEXT      License (see dalicc.net)  [required]
+  --apikey TEXT       API key  [required]
+  --help              Show this message and exit.
+```
+The command uploads all given files, and all files in the given folders, to the given remote and then registers them on the Databus. A Python equivalent is sketched after the example below.
+
+
+#### Example of using upload-and-deploy command
+
+```bash
+databusclient upload-and-deploy \
+  --webdav-url https://cloud.scadsai.uni-leipzig.de/remote.php/webdav \
+  --remote scads-nextcloud \
+  --path test \
+  --version-id https://databus.org/user/dataset/version/1.0 \
+  --title "Test Dataset" \
+  --abstract "This is a short abstract of the test dataset." \
+  --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \
+  --license https://dalicc.net/licenselibrary/Apache-2.0 \
+  --apikey "API-KEY" \
+  /home/test \
+  /home/test_folder/test
+```
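+
+For script-based pipelines, the same two steps can be driven from Python via the functions behind this command (`nextcloudclient.upload.upload_to_nextcloud` and `databusclient.client.deploy_from_metadata`). A minimal sketch; the WebDAV URL, remote name, paths, and API key are illustrative placeholders:
+
+```python
+# Minimal sketch: upload local files/folders via rclone, then deploy the
+# resulting metadata to the Databus. All concrete values are placeholders.
+from nextcloudclient import upload
+from databusclient import client
+
+# upload_to_nextcloud returns a list of dicts:
+# {"filename": ..., "checksum": ..., "size": ..., "url": ...}
+metadata = upload.upload_to_nextcloud(
+    ["/home/test", "/home/test_folder/test"],  # local files or folders
+    "scads-nextcloud",                         # rclone remote name
+    "test",                                    # remote base path
+    "https://cloud.scadsai.uni-leipzig.de/remote.php/webdav",
+)
+
+client.deploy_from_metadata(
+    metadata,
+    "https://databus.org/user/dataset/version/1.0",
+    "Test Dataset",
+    "This is a short abstract of the test dataset.",
+    "Dataset deployed via the Python module API.",
+    "https://dalicc.net/licenselibrary/Apache-2.0",
+    "API-KEY",
+)
+```
+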
+### Deploy-with-metadata command
+```bash
+databusclient deploy-with-metadata --help
+```
+```text
+Usage: databusclient deploy-with-metadata [OPTIONS]
+
+  Deploy to DBpedia Databus using metadata json file.
+
+Options:
+  --metadata PATH     Path to metadata JSON file  [required]
+  --version-id TEXT   Target databus version/dataset identifier of the form  [required]
+  --title TEXT        Dataset title  [required]
+  --abstract TEXT     Dataset abstract max 200 chars  [required]
+  --description TEXT  Dataset description  [required]
+  --license TEXT      License (see dalicc.net)  [required]
+  --apikey TEXT       API key  [required]
+  --help              Show this message and exit.
+```
+
+Use a metadata JSON file (see [databusclient/metadata.json](databusclient/metadata.json)) to list all files that should be added to the Databus.
+The command registers all listed files on the Databus.
+
+
+#### Example of using deploy-with-metadata command
+
+```bash
+databusclient deploy-with-metadata \
+  --metadata /home/metadata.json \
+  --version-id https://databus.org/user/dataset/version/1.0 \
+  --title "Test Dataset" \
+  --abstract "This is a short abstract of the test dataset." \
+  --description "This dataset was uploaded for testing the Nextcloud → Databus deployment pipeline." \
+  --license https://dalicc.net/licenselibrary/Apache-2.0 \
+  --apikey "API-KEY"
+```
+
+
+## Module Usage
 ### Step 1: Create lists of distributions for the dataset
 ```python
diff --git a/databusclient/cli.py b/databusclient/cli.py
index 8fc3e02..77e1bca 100644
--- a/databusclient/cli.py
+++ b/databusclient/cli.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
+import json
+
 import click
 from typing import List
 from databusclient import client
+from nextcloudclient import upload
 
 @click.group()
 def app():
@@ -36,6 +39,78 @@ def deploy(version_id, title, abstract, description, license_url, apikey, distri
     client.deploy(dataid=dataid, api_key=apikey)
 
 
+@app.command()
+@click.option(
+    "--metadata", "metadata_file",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to metadata JSON file",
+)
+@click.option(
+    "--version-id", "version_id",
+    required=True,
+    help="Target databus version/dataset identifier of the form "
+         "",
+)
+@click.option("--title", required=True, help="Dataset title")
+@click.option("--abstract", required=True, help="Dataset abstract max 200 chars")
+@click.option("--description", required=True, help="Dataset description")
+@click.option("--license", "license_url", required=True, help="License (see dalicc.net)")
+@click.option("--apikey", required=True, help="API key")
+def deploy_with_metadata(metadata_file, version_id, title, abstract, description, license_url, apikey):
+    """
+    Deploy to DBpedia Databus using metadata json file.
+    """
+    with open(metadata_file, 'r') as f:
+        metadata = json.load(f)
+
+    client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey)
+
+
+@app.command()
+@click.option(
+    "--webdav-url", "webdav_url",
+    required=True,
+    help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)",
+)
+@click.option(
+    "--remote",
+    required=True,
+    help="rclone remote name (e.g., 'nextcloud')",
+)
+@click.option(
+    "--path",
+    required=True,
+    help="Remote path on Nextcloud (e.g., 'datasets/mydataset')",
+)
+@click.option(
+    "--version-id", "version_id",
+    required=True,
+    help="Target databus version/dataset identifier of the form "
+         "",
+)
+@click.option("--title", required=True, help="Dataset title")
+@click.option("--abstract", required=True, help="Dataset abstract max 200 chars")
+@click.option("--description", required=True, help="Dataset description")
+@click.option("--license", "license_url", required=True, help="License (see dalicc.net)")
+@click.option("--apikey", required=True, help="API key")
+@click.argument(
+    "files",
+    nargs=-1,
+    type=click.Path(exists=True),
+)
+def upload_and_deploy(webdav_url, remote, path, version_id, title, abstract, description, license_url, apikey,
+                      files: List[str]):
+    """
+    Upload files to Nextcloud and deploy to DBpedia Databus.
+ """ + + click.echo(f"Uploading data to nextcloud: {remote}") + metadata = upload.upload_to_nextcloud(files, remote, path, webdav_url) + client.deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey) + + @app.command() @click.argument("databusuris", nargs=-1, required=True) @click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") diff --git a/databusclient/client.py b/databusclient/client.py index 764bf6b..79e19ab 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -205,6 +205,56 @@ def create_distribution( return f"{url}|{meta_string}" +def create_distributions_from_metadata(metadata): + distributions = [] + counter = 0 + for entry in metadata: + filename = entry["filename"] + checksum = entry["checksum"] + size = entry["size"] + if not isinstance(size, int) or size <= 0: + raise ValueError(f"Invalid size for {filename}: expected positive integer, got {size}") + url = entry["url"] + # Validate SHA-256 hex digest (64 hex chars) + if not isinstance(checksum, str) or len(checksum) != 64 or not all( + c in '0123456789abcdefABCDEF' for c in checksum): + raise ValueError(f"Invalid checksum for {filename}") + # Known compression extensions + COMPRESSION_EXTS = {"gz", "bz2", "xz", "zip", "7z", "tar", "lz", "zst"} + + parts = filename.split(".") + if len(parts) == 1: + file_format = "none" + compression = "none" + elif len(parts) == 2: + file_format = parts[-1] + compression = "none" + else: + # Check if last part is a known compression + + if parts[-1] in COMPRESSION_EXTS: + compression = parts[-1] + # Handle compound extensions like .tar.gz + if len(parts) > 2 and parts[-2] in COMPRESSION_EXTS: + file_format = parts[-3] if len(parts) > 3 else "file" + else: + file_format = parts[-2] + else: + file_format = parts[-1] + compression = "none" + + distributions.append( + create_distribution( + url=url, + cvs={"count": f"{counter}"}, + file_format=file_format, + compression=compression, + sha256_length_tuple=(checksum, size) + ) + ) + counter += 1 + return distributions + def create_dataset( version_id: str, @@ -393,6 +443,25 @@ def deploy( print(resp.text) +def deploy_from_metadata(metadata, version_id, title, abstract, description, license_url, apikey): + distributions = create_distributions_from_metadata(metadata) + + dataset = create_dataset( + version_id=version_id, + title=title, + abstract=abstract, + description=description, + license_url=license_url, + distributions=distributions + ) + + print(f"Deploying dataset version: {version_id}") + deploy(dataset, apikey) + + metadata_string = ",\n".join(entry["url"] for entry in metadata) + print(f"Successfully deployed\n{metadata_string}\nto databus {version_id}") + + def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: """ Download a file from the internet with a progress bar using tqdm. 
@@ -635,7 +704,7 @@ def __download_list__(urls: List[str],
 def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]:
     uri = uri.removeprefix("https://").removeprefix("http://")
     parts = uri.strip("/").split("/")
-    parts += [None] * (6 - len(parts))  # pad with None if less than 6 parts
+    parts += [None] * (6 - len(parts))  # pad with None if fewer than 6 parts
     return tuple(parts[:6])  # return only the first 6 parts
 
diff --git a/databusclient/metadata.json b/databusclient/metadata.json
new file mode 100644
index 0000000..64363d2
--- /dev/null
+++ b/databusclient/metadata.json
@@ -0,0 +1,14 @@
+[
+  {
+    "filename": "example.ttl",
+    "checksum": "0929436d44bba110fc7578c138ed770ae9f548e195d19c2f00d813cca24b9f39",
+    "size": 12345,
+    "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.ttl"
+  },
+  {
+    "filename": "example.csv.gz",
+    "checksum": "2238acdd7cf6bc8d9c9963a9f6014051c754bf8a04aacc5cb10448e2da72c537",
+    "size": 54321,
+    "url": "https://cloud.example.com/remote.php/webdav/datasets/mydataset/example.csv.gz"
+  }
+]
diff --git a/nextcloudclient/__init__.py b/nextcloudclient/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nextcloudclient/upload.py b/nextcloudclient/upload.py
new file mode 100644
index 0000000..f0d3328
--- /dev/null
+++ b/nextcloudclient/upload.py
@@ -0,0 +1,82 @@
+import hashlib
+import os
+import posixpath
+import subprocess
+from urllib.parse import urljoin, quote
+
+
+def compute_sha256_and_length(filepath):
+    """Stream a file and return its SHA-256 hex digest and byte length."""
+    sha256 = hashlib.sha256()
+    total_length = 0
+    with open(filepath, 'rb') as f:
+        while True:
+            chunk = f.read(4096)
+            if not chunk:
+                break
+            sha256.update(chunk)
+            total_length += len(chunk)
+    return sha256.hexdigest(), total_length
+
+
+def get_all_files(path):
+    """Return a flat list of all files under path (or [path] if it is a file)."""
+    if os.path.isfile(path):
+        return [path]
+    files = []
+    for root, _, filenames in os.walk(path):
+        for name in filenames:
+            files.append(os.path.join(root, name))
+    return files
+
+
+def upload_to_nextcloud(source_paths: list[str], remote_name: str, remote_path: str, webdav_url: str):
+    """Upload files/folders via rclone and return one metadata entry per uploaded file."""
+    result = []
+    for path in source_paths:
+        if not os.path.exists(path):
+            print(f"Path not found: {path}")
+            continue
+
+        abs_path = os.path.abspath(path)
+        basename = os.path.basename(abs_path)
+        files = get_all_files(abs_path)
+
+        tmp_results = []
+
+        for file in files:
+            checksum, size = compute_sha256_and_length(file)
+
+            if os.path.isdir(path):
+                rel_file = os.path.relpath(file, abs_path)
+                # Normalize to POSIX for WebDAV/URLs
+                rel_file = rel_file.replace(os.sep, "/")
+                remote_webdav_path = posixpath.join(remote_path, basename, rel_file)
+            else:
+                remote_webdav_path = posixpath.join(remote_path, os.path.basename(file))
+
+            # Preserve scheme/host and percent-encode path segments
+            url = urljoin(webdav_url.rstrip("/") + "/", quote(remote_webdav_path.lstrip("/"), safe="/"))
+
+            tmp_results.append({
+                "filename": os.path.basename(file),
+                "checksum": checksum,
+                "size": size,
+                "url": url,
+            })
+
+        dest_subpath = posixpath.join(remote_path.lstrip("/"), basename)
+        destination = f"{remote_name}:{dest_subpath}"
+        if os.path.isdir(path):
+            command = ["rclone", "copy", abs_path, destination, "--progress"]
+        else:
+            command = ["rclone", "copyto", abs_path, destination, "--progress"]
+
+        print(f"Upload: {path} → {destination}")
+        try:
+            subprocess.run(command, check=True)
+            result.extend(tmp_results)
+            print("✅ Uploaded successfully.\n")
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Error uploading {path}: {e}\n")
+        except FileNotFoundError:
+            print("❌ rclone not found on PATH. Install rclone and retry.")
+
+    return result
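Metadata entries for files that are already on the remote can be produced with these same helpers and passed to `deploy-with-metadata` (or `upload-and-deploy --no-upload`). A minimal sketch; the WebDAV prefix and local paths are illustrative:

```python
# Sketch: build a metadata.json for files that are already uploaded,
# reusing the helpers above. WEBDAV_PREFIX and local paths are illustrative.
import json
import os

from nextcloudclient.upload import compute_sha256_and_length, get_all_files

WEBDAV_PREFIX = "https://cloud.example.com/remote.php/webdav/datasets/mydataset"

entries = []
for local_file in get_all_files("/home/test_folder"):
    checksum, size = compute_sha256_and_length(local_file)
    name = os.path.basename(local_file)
    entries.append({
        "filename": name,
        "checksum": checksum,
        "size": size,
        "url": f"{WEBDAV_PREFIX}/{name}",
    })

with open("metadata.json", "w") as f:
    json.dump(entries, f, indent=2)
```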
diff --git a/poetry.lock b/poetry.lock
index c5b6e69..b4b80af 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
 
 [[package]]
 name = "black"
@@ -442,7 +442,7 @@ description = "Backported and Experimental Type Hints for Python 3.8+"
 optional = false
 python-versions = ">=3.8"
 groups = ["dev"]
-markers = "python_version == \"3.9\""
+markers = "python_version < \"3.10\""
 files = [
     {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"},
     {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"},
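Since `create_distributions_from_metadata` rejects malformed entries only at deploy time, a metadata file can be pre-checked against the same rules before calling the CLI. A minimal standalone sketch:

```python
# Pre-flight check mirroring the validation rules in
# create_distributions_from_metadata (64-char SHA-256 hex digest, positive size).
import json
import sys


def validate_metadata(path: str) -> None:
    with open(path) as f:
        entries = json.load(f)
    for entry in entries:
        name = entry["filename"]
        checksum = entry["checksum"]
        size = entry["size"]
        if not (isinstance(checksum, str) and len(checksum) == 64
                and all(c in "0123456789abcdefABCDEF" for c in checksum)):
            sys.exit(f"Invalid checksum for {name}")
        if not isinstance(size, int) or size <= 0:
            sys.exit(f"Invalid size for {name}: {size}")
    print(f"{len(entries)} entries OK")


if __name__ == "__main__":
    validate_metadata(sys.argv[1])  # e.g., databusclient/metadata.json
```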