diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index e0f247d..3d3bc95 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -32,7 +32,7 @@ jobs: platform: - ubuntu-latest - macos-latest - - windows-latest + # - windows-latest runs-on: ${{ matrix.platform }} name: Python ${{ matrix.python }}, ${{ matrix.platform }} steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 205cc5e..ea495e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,6 @@ # Changelog -## Version 0.1 (development) +## Version 0.0.1 -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! +- Initial implementation to access OrgDB objects. +- This also fetches the annotation hub sqlite file and queries for available org sqlite files instead of a static registry used in the txdb package. diff --git a/README.md b/README.md index 0ef054d..4ebd0a1 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ [![PyPI-Server](https://img.shields.io/pypi/v/orgdb.svg)](https://pypi.org/project/orgdb/) -![Unit tests](https://github.com/YOUR_ORG_OR_USERNAME/orgdb/actions/workflows/run-tests.yml/badge.svg) +![Unit tests](https://github.com/BiocPy/orgdb/actions/workflows/run-tests.yml/badge.svg) # orgdb -> Access OrgDB annotations +**OrgDb** provides an interface to access and query **Organism Database (OrgDb)** SQLite files in Python. It mirrors functionality from the R/Bioconductor `AnnotationDbi` package, enabling seamless integration of organism-wide gene annotation into Python workflows. -A longer description of your project goes here... +> [!NOTE] +> +> If you are looking to access TxDb databases, check out the [txdb package](https://www.github.com/biocpy/txdb). 
## Install @@ -15,6 +17,107 @@ To get started, install the package from [PyPI](https://pypi.org/project/orgdb/) pip install orgdb ``` +## Usage + +### Using OrgDbRegistry + +The registry downloads the AnnotationHub's metadata sqlite file and filters for all available OrgDb databases. You can fetch standard organism databases via the registry (backed by AnnotationHub). + +```py +from orgdb import OrgDbRegistry + +# Initialize registry and list available organisms +registry = OrgDbRegistry() +available = registry.list_orgdb() +print(available[:5]) +# ["org.'Caballeronia_concitans'.eg", "org.'Chlorella_vulgaris'_C-169.eg", ...] + +# Load the database for Homo sapiens (downloads and caches automatically) +db = registry.load_db("org.Hs.eg.db") +print(db.species) +# 'Homo sapiens' +``` + +### Inspecting metadata + +Explore the available columns and key types in the database. + +```py +# List available columns (and keytypes) +cols = db.columns() +print(cols[:5]) +# ['ENTREZID', 'PFAM', 'IPI', 'PROSITE', 'ACCNUM'] + +# Check available keys for a specific keytype +entrez_ids = db.keys("ENTREZID") +print(entrez_ids[:5]) +# ['1', '2', '9', '10', '11'] +``` + +### Querying Annotations (using `select`) + +The `select` method retrieves data as a `BiocFrame`. It automatically handles complex joins across tables. + +```py +# Retrieve Gene Symbols and Gene Names for a list of Entrez IDs +res = db.select( + keys=["1", "10"], + columns=["SYMBOL", "GENENAME"], + keytype="ENTREZID" +) + +print(res) +# BiocFrame with 2 rows and 3 columns + GENENAME ENTREZID SYMBOL + +# [0] alpha-1-B glycoprotein 1 A1BG +# [1] N-acetyltransferase 2 10 NAT2 + +``` + +> [!NOTE] +> +> If you request "GO" columns, the result will automatically expand to include "EVIDENCE" and "ONTOLOGY" columns, matching Bioconductor behavior. 
+ +```py +go_res = db.select( + keys="1", + columns=["GO"], + keytype="ENTREZID" +) +# BiocFrame with 12 rows and 4 columns + ONTOLOGY ENTREZID GO EVIDENCE + +# [0] BP 1 GO:0002764 IBA +# [1] CC 1 GO:0005576 HDA +# [2] CC 1 GO:0005576 IDA +# ... ... ... ... +# [9] CC 1 GO:0070062 HDA +# [10] CC 1 GO:0072562 HDA +# [11] CC 1 GO:1904813 TAS +``` + +### Accessing Genomic Ranges + +Extract gene coordinates as a `GenomicRanges` object (requires the `chromosome_locations` table in the OrgDb database). + +```py +gr = db.genes() +print(gr) +# GenomicRanges with 52232 ranges and 1 metadata column +# seqnames ranges strand gene_id +# +# 1 19 -58345182 - -58336872 * | 1 +# 2 12 -9067707 - -9019495 * | 2 +# 2 12 -9067707 - -9019185 * | 2 +# ... ... ... | ... +# 116804918 11 121024101 - 121191490 * | 116804918 +# 117779438 1 20154213 - 20160568 * | 117779438 +# 118142757 6 42155405 - 42180056 * | 118142757 +# ------ +# seqinfo(369 sequences): 1 10 10_GL383545v1_alt ... X_KI270913v1_alt Y Y_KZ208924v1_fix +``` + ## Note diff --git a/docs/index.md b/docs/index.md index d46368c..672cafc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1,14 @@ # orgdb -Access OrgDB annotations +**OrgDb** provides an interface to access and query **Organism Database (OrgDb)** SQLite files in Python. It mirrors functionality from the R/Bioconductor `AnnotationDbi` package, enabling seamless integration of organism-wide gene annotation into Python workflows. +## Install -## Note - -> This is the main page of your project's [Sphinx] documentation. It is -> formatted in [Markdown]. Add additional pages by creating md-files in -> `docs` or rst-files (formatted in [reStructuredText]) and adding links to -> them in the `Contents` section below. -> -> Please check [Sphinx] and [MyST] for more information -> about how to document your project and how to configure your preferences. 
+To get started, install the package from [PyPI](https://pypi.org/project/orgdb/) +```bash +pip install orgdb +``` ## Contents diff --git a/setup.cfg b/setup.cfg index f1f9240..d9721f7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,11 +12,11 @@ license = MIT license_files = LICENSE.txt long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8; variant=GFM -url = https://github.com/pyscaffold/pyscaffold/ +url = https://github.com/BiocPy/orgdb # Add here related links, for example: project_urls = - Documentation = https://pyscaffold.org/ -# Source = https://github.com/pyscaffold/pyscaffold/ + Documentation = https://github.com/BiocPy/orgdb + Source = https://github.com/BiocPy/orgdb # Changelog = https://pyscaffold.org/en/latest/changelog.html # Tracker = https://github.com/pyscaffold/pyscaffold/issues # Conda-Forge = https://anaconda.org/conda-forge/pyscaffold @@ -49,6 +49,9 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" + genomicranges + biocframe + pybiocfilecache [options.packages.find] diff --git a/src/orgdb/__init__.py b/src/orgdb/__init__.py index e451f10..c04f046 100644 --- a/src/orgdb/__init__.py +++ b/src/orgdb/__init__.py @@ -14,3 +14,9 @@ __version__ = "unknown" finally: del version, PackageNotFoundError + +from .orgdb import OrgDb +from .orgdbregistry import OrgDbRegistry +from .record import OrgDbRecord + +__all__ = ["OrgDb", "OrgDbRegistry", "OrgDbRecord"] diff --git a/src/orgdb/_ahub.py b/src/orgdb/_ahub.py new file mode 100644 index 0000000..a85443a --- /dev/null +++ b/src/orgdb/_ahub.py @@ -0,0 +1,31 @@ +"""This list of OrgDB resources was generated from AnnotationHub. 
class OrgDb:
    """Interface for accessing OrgDb SQLite databases in Python.

    Column names such as ``"SYMBOL"`` or ``"GO"`` are resolved to
    ``(table, field)`` pairs through a table map that is chosen from the
    organism recorded in the database's ``metadata`` table.  The object keeps
    one open ``sqlite3`` connection; use it as a context manager or call
    :meth:`close` when done.
    """

    # Column name -> (table, field) shared by the stock organism databases.
    # "ALIAS2PROBE" is deliberately absent: it is never offered for OrgDb
    # objects, which is the only database class this wrapper handles.
    _BASE_TABLES = {
        "ENTREZID": ("genes", "gene_id"),
        "PFAM": ("pfam", "pfam_id"),
        "IPI": ("pfam", "ipi_id"),
        "PROSITE": ("prosite", "prosite_id"),
        "ACCNUM": ("accessions", "accession"),
        "ALIAS": ("alias", "alias_symbol"),
        "ALIAS2EG": ("alias", "alias_symbol"),
        "CHR": ("chromosomes", "chromosome"),
        "CHRLOCCHR": ("chromosome_locations", "seqname"),
        "CHRLOC": ("chromosome_locations", "start_location"),
        "CHRLOCEND": ("chromosome_locations", "end_location"),
        "ENZYME": ("ec", "ec_number"),
        "MAP": ("cytogenetic_locations", "cytogenetic_location"),
        "PATH": ("kegg", "path_id"),
        "PMID": ("pubmed", "pubmed_id"),
        "REFSEQ": ("refseq", "accession"),
        "SYMBOL": ("gene_info", "symbol"),
        "GENETYPE": ("genetype", "gene_type"),
        "ENSEMBL": ("ensembl", "ensembl_id"),
        "ENSEMBLPROT": ("ensembl_prot", "prot_id"),
        "ENSEMBLTRANS": ("ensembl_trans", "trans_id"),
        "GENENAME": ("gene_info", "gene_name"),
        "UNIPROT": ("uniprot", "uniprot_id"),
        "GO": ("go", "go_id"),
        "EVIDENCE": ("go", "evidence"),
        "ONTOLOGY": ("go", "ontology"),
        "GOALL": ("go_all", "go_id"),
        "EVIDENCEALL": ("go_all", "evidence"),
        "ONTOLOGYALL": ("go_all", "ontology"),
    }

    # Reduced map used for every organism that has no entry in _SPECIES_RULES.
    _GENERIC_TABLES = {
        "ENTREZID": ("genes", "gene_id"),
        "ACCNUM": ("accessions", "accession"),
        "ALIAS": ("alias", "alias_symbol"),
        "ALIAS2EG": ("alias", "alias_symbol"),
        "ALIAS2PROBE": ("alias", "alias_symbol"),
        "CHR": ("chromosomes", "chromosome"),
        "PMID": ("pubmed", "pubmed_id"),
        "REFSEQ": ("refseq", "accession"),
        "SYMBOL": ("gene_info", "symbol"),
        "GENETYPE": ("genetype", "gene_type"),
        "GENENAME": ("gene_info", "gene_name"),
        "GO": ("go", "go_id"),
        "EVIDENCE": ("go", "evidence"),
        "ONTOLOGY": ("go", "ontology"),
    }

    # species -> (columns added, columns dropped, columns re-added afterwards).
    # The three steps are applied in that order so that re-added columns end
    # up at the end of the map and can point at a different table.
    _SPECIES_RULES = {
        "Anopheles gambiae": (
            {},
            ("ALIAS", "MAP", "CHRLOC", "CHRLOCEND", "GENETYPE", "CHRLOCCHR", "PFAM", "IPI", "PROSITE"),
            {},
        ),
        "Arabidopsis thaliana": (
            {
                "TAIR": ("genes", "gene_id"),
                "ARACYC": ("aracyc", "pathway_name"),
                "ARACYCENZYME": ("enzyme", "ec_name"),
            },
            (
                "ACCNUM",
                "ALIAS",
                "ALIAS2EG",
                "MAP",
                "GENETYPE",
                "PFAM",
                "IPI",
                "PROSITE",
                "ENSEMBL",
                "ENSEMBLPROT",
                "ENSEMBLTRANS",
                "UNIPROT",
                "ENTREZID",
                "CHR",
            ),
            {
                "ENTREZID": ("entrez_genes", "gene_id"),
                "CHR": ("gene_info", "chromosome"),
            },
        ),
        "Bos taurus": ({}, ("MAP",), {}),
        "Caenorhabditis elegans": (
            {"WORMBASE": ("wormbase", "wormbase_id")},
            ("MAP", "PFAM", "GENETYPE", "IPI", "PROSITE"),
            {},
        ),
        "Canis familiaris": ({}, ("MAP", "PFAM", "IPI", "PROSITE"), {}),
        "Drosophila melanogaster": (
            {
                "FLYBASE": ("flybase", "flybase_id"),
                "FLYBASECG": ("flybase_cg", "flybase_cg_id"),
                "FLYBASEPROT": ("flybase_prot", "prot_id"),
            },
            ("PFAM", "IPI", "PROSITE"),
            {},
        ),
        "Danio rerio": ({"ZFIN": ("zfin", "zfin_id")}, ("MAP", "GENETYPE"), {}),
        "Escherichia coli": (
            {},
            (
                "CHR",
                "MAP",
                "GENETYPE",
                "CHRLOC",
                "CHRLOCEND",
                "CHRLOCCHR",
                "PFAM",
                "IPI",
                "PROSITE",
                "ENSEMBL",
                "ENSEMBLPROT",
                "ENSEMBLTRANS",
                "UNIPROT",
            ),
            {},
        ),
        "Gallus gallus": ({}, ("MAP",), {}),
        "Homo sapiens": (
            {"OMIM": ("omim", "omim_id"), "UCSCKG": ("ucsc", "ucsc_id")},
            (),
            {},
        ),
        "Mus musculus": ({"MGI": ("mgi", "mgi_id")}, ("MAP",), {}),
        "Macaca mulatta": ({}, ("ALIAS", "MAP", "PFAM", "IPI", "PROSITE"), {}),
        "Plasmodium falciparum": (
            {"ORF": ("genes", "gene_id")},
            (
                "ENTREZID",
                "ACCNUM",
                "ALIAS",
                "ALIAS2EG",
                "CHR",
                "CHRLOC",
                "CHRLOCEND",
                "CHRLOCCHR",
                "GENETYPE",
                "MAP",
                "PMID",
                "REFSEQ",
                "PFAM",
                "IPI",
                "PROSITE",
                "ENSEMBL",
                "ENSEMBLPROT",
                "ENSEMBLTRANS",
                "UNIPROT",
            ),
            {"ALIAS": ("alias", "alias_symbol")},
        ),
        "Pan troglodytes": ({}, ("ALIAS", "MAP", "GENETYPE", "PFAM", "IPI", "PROSITE"), {}),
        "Rattus norvegicus": ({}, ("MAP",), {}),
        "Saccharomyces cerevisiae": (
            {
                "ORF": ("gene2systematic", "systematic_name"),
                "DESCRIPTION": ("chromosome_features", "feature_description"),
                "COMMON": ("gene2systematic", "gene_name"),
                "INTERPRO": ("interpro", "interpro_id"),
                "SMART": ("smart", "smart_id"),
                "SGD": ("sgd", "sgd_id"),
            },
            (
                "ACCNUM",
                "MAP",
                "SYMBOL",
                "GENETYPE",
                "PROSITE",
                "ALIAS",
                "ALIAS2EG",
                "CHRLOC",
                "CHRLOCEND",
                "CHRLOCCHR",
                "GENENAME",
                "IPI",
                "CHR",
            ),
            {
                "ALIAS": ("gene2alias", "alias"),
                "CHRLOC": ("chromosome_features", "start"),
                "CHRLOCEND": ("chromosome_features", "stop"),
                "CHRLOCCHR": ("chromosome_features", "chromosome"),
                "GENENAME": ("sgd", "gene_name"),
                "CHR": ("chromosome_features", "chromosome"),
            },
        ),
        "Sus scrofa": (
            {},
            (
                "MAP",
                "CHRLOC",
                "CHRLOCEND",
                "CHRLOCCHR",
                "PFAM",
                "IPI",
                "PROSITE",
                "ENSEMBL",
                "ENSEMBLPROT",
                "ENSEMBLTRANS",
            ),
            {},
        ),
        "Xenopus laevis": (
            {},
            (
                "ALIAS",
                "MAP",
                "CHRLOC",
                "CHRLOCEND",
                "CHRLOCCHR",
                "PFAM",
                "IPI",
                "PROSITE",
                "ENSEMBL",
                "ENSEMBLPROT",
                "ENSEMBLTRANS",
            ),
            {},
        ),
    }

    _MULTI_VALS = ("first", "list", "filter")

    def __init__(self, dbpath: str):
        """Initialize the OrgDb object.

        Args:
            dbpath:
                Path to the SQLite database file.
        """
        self.dbpath = dbpath
        self.conn = sqlite3.connect(dbpath)
        self.conn.row_factory = sqlite3.Row
        self._metadata = None
        try:
            # Reads the metadata table, so it can fail on a file that is not
            # an OrgDb database; do not leave the connection open in that case.
            self._table_map = self._define_tables()
        except Exception:
            self.conn.close()
            raise

    def _query_as_biocframe(self, query: str, params: tuple = ()) -> BiocFrame:
        """Execute a SQL query and return the result as a BiocFrame.

        Args:
            query:
                SQL text, optionally containing ``?`` placeholders.

            params:
                Values bound to the placeholders.

        Returns:
            A BiocFrame with one column per result column.  A query that
            matches no rows yields a frame whose columns exist but are empty,
            so column lookups on the result keep working.
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute(query, params)
            rows = cursor.fetchall()
            col_names = [desc[0] for desc in cursor.description] if cursor.description else []
        finally:
            cursor.close()

        if not col_names:
            return BiocFrame({})

        if not rows:
            return BiocFrame({name: [] for name in col_names})

        columns_data = list(zip(*rows))
        return BiocFrame({name: list(columns_data[i]) for i, name in enumerate(col_names)})

    @property
    def metadata(self) -> BiocFrame:
        """Get the metadata table from the database (queried once, then cached)."""
        if self._metadata is None:
            self._metadata = self._query_as_biocframe("SELECT * FROM metadata")
        return self._metadata

    @property
    def species(self) -> str:
        """Get the organism/species name from metadata.

        Returns:
            The value of the first metadata row named ``ORGANISM``,
            ``Organism`` or ``Genus and Species``; ``"Unknown"`` otherwise.
        """
        meta = self.metadata
        column_names = meta.column_names
        if "name" not in column_names or "value" not in column_names:
            return "Unknown"

        wanted = ("ORGANISM", "Organism", "Genus and Species")
        for name, value in zip(meta.get_column("name"), meta.get_column("value")):
            if name in wanted:
                return value
        return "Unknown"

    def _define_tables(self) -> Dict[str, tuple]:
        """Define the mapping between column names and (table, field).

        Mirrors .definePossibleTables from R/methods-geneCentricDbs.R

        Returns:
            A new dict.  Organisms listed in ``_SPECIES_RULES`` start from
            ``_BASE_TABLES`` and apply their additions, removals and
            re-additions; every other organism gets ``_GENERIC_TABLES``.
        """
        rules = self._SPECIES_RULES.get(self.species)
        if rules is None:
            return dict(self._GENERIC_TABLES)

        added, dropped, readded = rules
        mapping = dict(self._BASE_TABLES)
        mapping.update(added)
        for name in dropped:
            mapping.pop(name, None)
        mapping.update(readded)
        return mapping

    def columns(self) -> List[str]:
        """List all available columns/keytypes."""
        return list(self._table_map.keys())

    def keytypes(self) -> List[str]:
        """List all available keytypes (same as columns)."""
        return self.columns()

    def keys(self, keytype: str) -> List[str]:
        """Return the distinct, non-NULL keys for the given keytype as strings.

        Raises:
            ValueError: If ``keytype`` is not one of :meth:`columns`.
        """
        if keytype not in self._table_map:
            raise ValueError(f"Invalid keytype: {keytype}. Use columns() to see valid options.")

        table, field = self._table_map[keytype]
        # NULLs are excluded so that they do not surface as the string "None".
        query = f"SELECT DISTINCT {field} FROM {table} WHERE {field} IS NOT NULL"

        try:
            bf = self._query_as_biocframe(query)
        except sqlite3.OperationalError:
            # Best effort: the table map may name a table this file lacks.
            return []

        if bf.shape[0] == 0:
            return []
        return [str(x) for x in bf.get_column(field)]

    def _expand_cols(self, cols: List[str]) -> List[str]:
        """Expand columns like GO into GO, EVIDENCE, ONTOLOGY.

        ``CHRLOC`` likewise pulls in ``CHRLOCCHR``.  Duplicates are removed
        while keeping first-seen order, so the generated SELECT list (and
        therefore the result's column order) is the same on every run.
        """
        companions = {"GO": ("EVIDENCE", "ONTOLOGY"), "CHRLOC": ("CHRLOCCHR",)}
        expanded = []
        for col in cols:
            expanded.append(col)
            expanded.extend(companions.get(col, ()))
        return list(dict.fromkeys(expanded))

    def _build_select_query(self, keys, columns, keytype):
        """Build the SQL text and bound parameters used by :meth:`select`.

        Returns:
            A ``(sql, params)`` tuple; ``params`` holds ``keys`` in order.

        Raises:
            ValueError: If ``keytype`` or any requested column is unknown.
        """
        if keytype not in self._table_map:
            raise ValueError(f"Invalid keytype: {keytype}")

        unknown = [c for c in columns if c not in self._table_map]
        if unknown:
            raise ValueError(f"Invalid columns: {unknown}. Use columns() to see valid options.")

        base_table = "genes"
        fields = []
        joined = set()
        for col in self._expand_cols(list(columns) + [keytype]):
            if col not in self._table_map:
                # Companion column added by _expand_cols that this organism's
                # map does not provide; only user-supplied names are an error.
                continue
            table, field = self._table_map[col]
            fields.append(f"{table}.{field} AS {col}")
            if table != base_table:
                joined.add(table)

        join_clause = " ".join(f"LEFT JOIN {t} USING (_id)" for t in sorted(joined))
        kt_table, kt_field = self._table_map[keytype]
        placeholders = ",".join("?" * len(keys))
        sql = (
            f"SELECT {', '.join(fields)} FROM {base_table} {join_clause} "
            f"WHERE {kt_table}.{kt_field} IN ({placeholders})"
        )
        return sql, tuple(keys)

    def select(self, keys: Union[List[str], str], columns: Union[List[str], str], keytype: str) -> BiocFrame:
        """Retrieve data from the database.

        Args:
            keys:
                A list of keys to select (a single string is accepted).

            columns:
                List of columns to retrieve (a single string is accepted).

            keytype:
                The type of the provided keys (must be one of columns()).

        Returns:
            A BiocFrame with the requested columns, the ``keytype`` column and
            any companion columns added by the GO/CHRLOC expansion.

        Raises:
            ValueError: If ``keytype`` or a requested column is not available.
        """
        if isinstance(keys, str):
            keys = [keys]

        if isinstance(columns, str):
            columns = [columns]

        sql, params = self._build_select_query(keys, columns, keytype)
        return self._query_as_biocframe(sql, params)

    def mapIds(
        self, keys: Union[List[str], str], column: str, keytype: str, multiVals: str = "first"
    ) -> Union[dict, list]:
        """Map keys to a specific column. A wrapper around select.

        Args:
            keys:
                Keys to map (a single string is treated as one key).

            column:
                The column to map to.

            keytype:
                The ID type of the keys.

            multiVals:
                How to handle multiple values:
                ``'first'`` keeps the first match (``None`` if there is none),
                ``'list'`` returns every match as a list, and
                ``'filter'`` leaves out keys that have more than one match.

        Returns:
            A dict keyed by ``str(key)``.

        Raises:
            ValueError: If ``multiVals`` is not one of the supported modes.
        """
        if multiVals not in self._MULTI_VALS:
            raise ValueError(f"Invalid multiVals: {multiVals}. Expected one of {list(self._MULTI_VALS)}.")

        # Normalise here as well: iterating a bare string below would
        # otherwise walk over its characters.
        if isinstance(keys, str):
            keys = [keys]

        bf = self.select(keys, [column], keytype)

        matches = {}
        for k, v in zip(bf.get_column(keytype), bf.get_column(column)):
            bucket = matches.setdefault(str(k), [])
            if v is not None:
                bucket.append(v)

        result = {}
        for k in keys:
            k = str(k)
            vals = matches.get(k, [])
            if multiVals == "list":
                result[k] = vals
            elif multiVals == "filter" and len(vals) > 1:
                continue
            else:
                result[k] = vals[0] if vals else None
        return result

    @staticmethod
    def _location_to_range(start, end):
        """Convert one signed OrgDb location pair to (start, width, strand).

        A negative start marks the antisense strand; the sign is moved into
        the strand and the coordinates are returned as positive values.
        """
        strand = "-" if start < 0 else "+"
        low, high = sorted((abs(start), abs(end)))
        return low, high - low + 1, strand

    def genes(self) -> GenomicRanges:
        """Retrieve gene locations as GenomicRanges.

        Requires 'chromosome_locations' table in the DB; an empty
        GenomicRanges is returned when the table is missing or has no rows.
        Antisense locations, stored with a leading minus sign, are reported
        as positive coordinates on the ``-`` strand.
        """
        try:
            self._query_as_biocframe("SELECT 1 FROM chromosome_locations LIMIT 1")
        except sqlite3.OperationalError:
            return GenomicRanges.empty()

        query = """
            SELECT
                g.gene_id,
                c.seqname,
                c.start_location,
                c.end_location
            FROM genes g
            JOIN chromosome_locations c ON g._id = c._id
        """

        bf = self._query_as_biocframe(query)

        if bf.shape[0] == 0:
            return GenomicRanges.empty()

        names = [str(x) for x in bf.get_column("gene_id")]
        seqnames = [str(x) for x in bf.get_column("seqname")]

        starts, widths, strand = [], [], []
        for s, e in zip(bf.get_column("start_location"), bf.get_column("end_location")):
            start, width, sign = self._location_to_range(s, e)
            starts.append(start)
            widths.append(width)
            strand.append(sign)

        ranges = IRanges(start=starts, width=widths)
        mcols = BiocFrame({"gene_id": names}, row_names=names)

        return GenomicRanges(seqnames=seqnames, ranges=ranges, strand=strand, names=names, mcols=mcols)

    def close(self):
        """Close the database connection."""
        self.conn.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
b/src/orgdb/orgdbregistry.py @@ -0,0 +1,217 @@ +import os +import sqlite3 +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from pybiocfilecache import BiocFileCache + +from ._ahub import AHUB_METADATA_URL +from .orgdb import OrgDb +from .record import OrgDbRecord + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class OrgDbRegistry: + """Registry for OrgDb resources, dynamically populated from AnnotationHub.""" + + def __init__( + self, + cache_dir: Optional[Union[str, Path]] = None, + force: bool = False, + ) -> None: + """Initialize the OrgDb registry. + + Args: + cache_dir: + Directory for the BiocFileCache database and cached files. + If None, defaults to "~/.cache/orgdb_bfc". + + force: + If True, force re-download of the AnnotationHub metadata database. + """ + if cache_dir is None: + cache_dir = Path.home() / ".cache" / "orgdb_bfc" + + self._cache_dir = Path(cache_dir) + self._cache_dir.mkdir(parents=True, exist_ok=True) + self._bfc = BiocFileCache(self._cache_dir) + + self._registry_map: Dict[str, OrgDbRecord] = {} + + self._initialize_registry(force=force) + + def _initialize_registry(self, force: bool = False): + """Fetch the AnnotationHub metadata and populate the registry.""" + rname = "annotationhub_metadata" + + existing = None + try: + existing = self._bfc.get(rname) + except Exception: + pass + + if force and existing: + try: + self._bfc.remove(rname) + except Exception: + pass + existing = None + + if existing: + md_resource = existing + else: + md_resource = self._bfc.add(rname, AHUB_METADATA_URL, rtype="web") + + md_path = self._get_filepath(md_resource) + + if not md_path or not os.path.exists(md_path): + if existing and not force: + return self._initialize_registry(force=True) + + raise RuntimeError("Failed to retrieve AnnotationHub metadata database.") + + conn = sqlite3.connect(md_path) + try: + query = """ + SELECT + r.title, + r.rdatadateadded, + 
lp.location_prefix || rp.rdatapath AS full_rdatapath + FROM resources r + LEFT JOIN location_prefixes lp + ON r.location_prefix_id = lp.id + LEFT JOIN rdatapaths rp + ON rp.resource_id = r.id + WHERE r.title LIKE 'org.%.sqlite' + ORDER BY r.rdatadateadded DESC; + """ + cursor = conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + finally: + conn.close() + + for title, date_added, url in rows: + if title.endswith(".sqlite"): + orgdb_id = title[:-7] + else: + orgdb_id = title + + if orgdb_id in self._registry_map: + continue + + entry = {"url": url, "release_date": str(date_added).split(" ")[0] if date_added else None} + + record = OrgDbRecord.from_config_entry(orgdb_id, entry) + self._registry_map[orgdb_id] = record + + def list_orgdb(self) -> List[str]: + """List all available OrgDb IDs (e.g., 'org.Hs.eg.db'). + + Returns: + A sorted list of valid OrgDb ID strings. + """ + return sorted(list(self._registry_map.keys())) + + def get_record(self, orgdb_id: str) -> OrgDbRecord: + """Get the metadata record for a given OrgDb ID. + + Args: + orgdb_id: + The OrgDb ID to look up (e.g., 'org.Hs.eg.db'). + + Returns: + A OrgDbRecord object containing metadata. + + Raises: + KeyError: If the ID is not found. + """ + if orgdb_id not in self._registry_map: + raise KeyError(f"OrgDb ID '{orgdb_id}' not found in registry.") + + return self._registry_map[orgdb_id] + + def download(self, orgdb_id: str, force: bool = False) -> str: + """Download and cache the OrgDb file. + + Args: + orgdb_id: + The OrgDb ID to fetch. + + force: + If True, forces re-download even if already cached. + + Returns: + Local filesystem path to the cached file. 
+ """ + record = self.get_record(orgdb_id) + url = record.url + key = orgdb_id + + if force: + try: + self._bfc.remove(key) + except Exception: + pass + + # Check if already exists + if not force: + try: + existing = self._bfc.get(key) + if existing: + path = self._get_filepath(existing) + if path and os.path.exists(path) and os.path.getsize(path) > 0: + return path + except Exception: + pass + + # Add/Download + resource = self._bfc.add( + key, + url, + rtype="web", + download=True, + ) + + path = self._get_filepath(resource) + + # Validation + if not path or not os.path.exists(path) or os.path.getsize(path) == 0: + # Cleanup bad download + try: + self._bfc.remove(key) + except Exception: + pass + raise RuntimeError(f"Download failed for {orgdb_id}. File is empty or missing.") + + return path + + def load_db(self, orgdb_id: str, force: bool = False) -> OrgDb: + """Load an OrgDb object for the given ID. + + Args: + orgdb_id: + The ID of the OrgDb to load. + + force: + If True, forces re-download of the database file. + + Returns: + An initialized OrgDb object. 
+ """ + path = self.download(orgdb_id, force=force) + return OrgDb(path) + + def _get_filepath(self, resource: Any) -> Optional[str]: + """Helper to extract absolute path from a BiocFileCache resource.""" + if hasattr(resource, "rpath"): + rel_path = str(resource.rpath) + elif hasattr(resource, "get"): + rel_path = str(resource.get("rpath")) + else: + return None + + return str(self._cache_dir / rel_path) diff --git a/src/orgdb/record.py b/src/orgdb/record.py new file mode 100644 index 0000000..423c76d --- /dev/null +++ b/src/orgdb/record.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import date, datetime +from typing import Optional + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@dataclass(frozen=True) +class OrgDbRecord: + """Container for a single OrgDb entry.""" + + orgdb_id: str + release_date: Optional[date] + url: str + + species: Optional[str] = None # e.g. "Hs" or "Hsapiens" + id_type: Optional[str] = None # e.g. "eg" (Entrez Gene) or "tair" + + bioc_version: Optional[str] = None + + @classmethod + def from_config_entry(cls, orgdb_id: str, entry: dict) -> "OrgDbRecord": + """Build a record from a ORGDB_CONFIG entry: + { + "release_date": "YYYY-MM-DD", # optional + "url": "https://..." + } + """ + url = entry["url"] + + date_str = entry.get("release_date") + rel_date: Optional[date] + if date_str: + rel_date = datetime.strptime(date_str, "%Y-%m-%d").date() + else: + rel_date = None + + species, id_type = _parse_orgdb_id(orgdb_id) + bioc_version = _parse_bioc_version(url) + + return cls( + orgdb_id=orgdb_id, + release_date=rel_date, + url=url, + species=species, + id_type=id_type, + bioc_version=bioc_version, + ) + + +def _parse_orgdb_id(orgdb_id: str): + """Parse IDs like: + org.Hs.eg.db + org.At.tair.db + into (species, id_type). 
+ """ + name = orgdb_id + if name.startswith("org."): + name = name[len("org.") :] + + if name.endswith(".db"): + name = name[: -len(".db")] + + if name.endswith(".sqlite"): + name = name[: -len(".sqlite")] + + parts = name.split(".") + + if len(parts) < 2: + return None, None + + species = parts[0] + id_type = parts[1] + + return species, id_type + + +def _parse_bioc_version(url: str) -> Optional[str]: + """Extract the Bioconductor/AnnotationHub-like version from URL. + + Example: + .../standard/3.19/org.Hs.eg.sqlite -> "3.19" + """ + parts = url.rstrip("/").split("/") + if len(parts) < 2: + return None + + candidate = parts[-2] + + if candidate.replace(".", "").isdigit(): + return candidate + return None diff --git a/src/orgdb/skeleton.py b/src/orgdb/skeleton.py deleted file mode 100644 index d8a10db..0000000 --- a/src/orgdb/skeleton.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -This is a skeleton file that can serve as a starting point for a Python -console script. To run this script uncomment the following lines in the -``[options.entry_points]`` section in ``setup.cfg``:: - - console_scripts = - fibonacci = orgdb.skeleton:run - -Then run ``pip install .`` (or ``pip install -e .`` for editable mode) -which will install the command ``fibonacci`` inside your current environment. - -Besides console scripts, the header (i.e. until ``_logger``...) of this file can -also be used as template for Python modules. - -Note: - This file can be renamed depending on your needs or safely removed if not needed. 
- -References: - - https://setuptools.pypa.io/en/latest/userguide/entry_point.html - - https://pip.pypa.io/en/stable/reference/pip_install -""" - -import argparse -import logging -import sys - -from orgdb import __version__ - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - -_logger = logging.getLogger(__name__) - - -# ---- Python API ---- -# The functions defined in this section can be imported by users in their -# Python scripts/interactive interpreter, e.g. via -# `from orgdb.skeleton import fib`, -# when using this Python module as a library. - - -def fib(n): - """Fibonacci example function - - Args: - n (int): integer - - Returns: - int: n-th Fibonacci number - """ - assert n > 0 - a, b = 1, 1 - for _i in range(n - 1): - a, b = b, a + b - return a - - -# ---- CLI ---- -# The functions defined in this section are wrappers around the main Python -# API allowing them to be called directly from the terminal as a CLI -# executable/script. - - -def parse_args(args): - """Parse command line parameters - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--help"]``). 
- - Returns: - :obj:`argparse.Namespace`: command line parameters namespace - """ - parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration") - parser.add_argument( - "--version", - action="version", - version=f"orgdb {__version__}", - ) - parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT") - parser.add_argument( - "-v", - "--verbose", - dest="loglevel", - help="set loglevel to INFO", - action="store_const", - const=logging.INFO, - ) - parser.add_argument( - "-vv", - "--very-verbose", - dest="loglevel", - help="set loglevel to DEBUG", - action="store_const", - const=logging.DEBUG, - ) - return parser.parse_args(args) - - -def setup_logging(loglevel): - """Setup basic logging - - Args: - loglevel (int): minimum loglevel for emitting messages - """ - logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s" - logging.basicConfig( - level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S" - ) - - -def main(args): - """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion - - Instead of returning the value from :func:`fib`, it prints the result to the - ``stdout`` in a nicely formatted message. - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--verbose", "42"]``). - """ - args = parse_args(args) - setup_logging(args.loglevel) - _logger.debug("Starting crazy calculations...") - print(f"The {args.n}-th Fibonacci number is {fib(args.n)}") - _logger.info("Script ends here") - - -def run(): - """Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv` - - This function can be used as entry point to create console scripts with setuptools. - """ - main(sys.argv[1:]) - - -if __name__ == "__main__": - # ^ This is a guard statement that will prevent the following code from - # being executed in the case someone imports this file instead of - # executing it as a script. 
- # https://docs.python.org/3/library/__main__.html - - # After installing your project with pip, users can also run your Python - # modules as scripts via the ``-m`` flag, as defined in PEP 338:: - # - # python -m orgdb.skeleton 42 - # - run() diff --git a/tests/conftest.py b/tests/conftest.py index 844d350..c2998c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,4 +7,99 @@ - https://docs.pytest.org/en/stable/writing_plugins.html """ -# import pytest +import sqlite3 +from orgdb import OrgDb +import pytest + +@pytest.fixture +def mock_orgdb_path(tmp_path): + """Create a temporary SQLite file with a standard OrgDb schema and sample data.""" + db_path = tmp_path / "mock_org.sqlite" + conn = sqlite3.connect(db_path) + + # 1. metadata table + conn.execute("CREATE TABLE metadata (name VARCHAR(80) PRIMARY KEY, value VARCHAR(255))") + conn.execute("INSERT INTO metadata VALUES ('ORGANISM', 'Homo sapiens')") + conn.execute("INSERT INTO metadata VALUES ('DBSCHEMA', 'H. sapiens')") + + # 2. genes table (Central table) + # _id is internal PK, gene_id is Entrez ID + conn.execute(""" + CREATE TABLE genes ( + _id INTEGER PRIMARY KEY, + gene_id VARCHAR(10) NOT NULL UNIQUE + ) + """) + # Sample Genes: + # 1: 1 (A1BG) + # 2: 10 (NAT2) + # 3: 100 (ADA) + conn.execute("INSERT INTO genes VALUES (1, '1')") + conn.execute("INSERT INTO genes VALUES (2, '10')") + conn.execute("INSERT INTO genes VALUES (3, '100')") + + # 3. gene_info table (Symbol, Gene Name) + conn.execute(""" + CREATE TABLE gene_info ( + _id INTEGER NOT NULL, + gene_name VARCHAR(255) NOT NULL, + symbol VARCHAR(80) NOT NULL, + FOREIGN KEY (_id) REFERENCES genes (_id) + ) + """) + conn.execute("INSERT INTO gene_info VALUES (1, 'Alpha-1-B glycoprotein', 'A1BG')") + conn.execute("INSERT INTO gene_info VALUES (2, 'N-acetyltransferase 2', 'NAT2')") + conn.execute("INSERT INTO gene_info VALUES (3, 'Adenosine deaminase', 'ADA')") + + # 4. 
go table (GO terms) + # Note: A gene can have multiple GO terms + conn.execute(""" + CREATE TABLE go ( + _id INTEGER NOT NULL, + go_id CHAR(10) NOT NULL, + evidence CHAR(3) NOT NULL, + ontology CHAR(10) NOT NULL, + FOREIGN KEY (_id) REFERENCES genes (_id) + ) + """) + # Gene 1 (A1BG) -> GO:0000001 (CC), GO:0000002 (MF) + conn.execute("INSERT INTO go VALUES (1, 'GO:0000001', 'IEA', 'CC')") + conn.execute("INSERT INTO go VALUES (1, 'GO:0000002', 'TAS', 'MF')") + # Gene 2 (NAT2) -> GO:0000003 (BP) + conn.execute("INSERT INTO go VALUES (2, 'GO:0000003', 'IMP', 'BP')") + + # 5. chromosome_locations table (Genomic coordinates) + conn.execute(""" + CREATE TABLE chromosome_locations ( + _id INTEGER NOT NULL, + seqname VARCHAR(20) NOT NULL, + start_location INTEGER NOT NULL, + end_location INTEGER NOT NULL, + FOREIGN KEY (_id) REFERENCES genes (_id) + ) + """) + # A1BG on chr19 + conn.execute("INSERT INTO chromosome_locations VALUES (1, 'chr19', 58346806, 58353492)") + # NAT2 on chr8 + conn.execute("INSERT INTO chromosome_locations VALUES (2, 'chr8', 18248755, 18258723)") + + # 6. 
alias table (Aliases) + conn.execute(""" + CREATE TABLE alias ( + _id INTEGER NOT NULL, + alias_symbol VARCHAR(80) NOT NULL, + FOREIGN KEY (_id) REFERENCES genes (_id) + ) + """) + conn.execute("INSERT INTO alias VALUES (2, 'AAC2')") + + conn.commit() + conn.close() + return str(db_path) + +@pytest.fixture +def mock_orgdb(mock_orgdb_path): + """Return an open OrgDb instance using the mock database.""" + db = OrgDb(mock_orgdb_path) + yield db + db.close() \ No newline at end of file diff --git a/tests/test_orgdb.py b/tests/test_orgdb.py new file mode 100644 index 0000000..fe7ca4a --- /dev/null +++ b/tests/test_orgdb.py @@ -0,0 +1,92 @@ +from biocframe import BiocFrame +from genomicranges import GenomicRanges +import pytest +from orgdb import OrgDb + +def test_orgdb_init(mock_orgdb): + assert isinstance(mock_orgdb, OrgDb) + assert mock_orgdb.conn is not None + +def test_species(mock_orgdb): + assert mock_orgdb.species == "Homo sapiens" + +def test_columns(mock_orgdb): + cols = mock_orgdb.columns() + assert "ENTREZID" in cols + assert "SYMBOL" in cols + assert "GO" in cols + assert "GENENAME" in cols + +def test_keys(mock_orgdb): + keys = mock_orgdb.keys("ENTREZID") + assert "1" in keys + assert "10" in keys + assert "100" in keys + assert len(keys) == 3 + + syms = mock_orgdb.keys("SYMBOL") + assert "A1BG" in syms + assert "ADA" in syms + + with pytest.raises(ValueError): + mock_orgdb.keys("INVALID_TYPE") + +def test_select_simple(mock_orgdb): + res = mock_orgdb.select(keys="1", columns=["SYMBOL"], keytype="ENTREZID") + assert isinstance(res, BiocFrame) + assert len(res) == 1 + assert res.get_column("ENTREZID")[0] == "1" + assert res.get_column("SYMBOL")[0] == "A1BG" + +def test_select_multikey(mock_orgdb): + res = mock_orgdb.select(keys=["1", "10"], columns=["SYMBOL"], keytype="ENTREZID") + assert len(res) == 2 + + symbols = res.get_column("SYMBOL") + assert "A1BG" in symbols + assert "NAT2" in symbols + +def test_select_go_expansion(mock_orgdb): + res = 
mock_orgdb.select(keys="1", columns=["GO"], keytype="ENTREZID") + + col_names = list(res.column_names) + assert "GO" in col_names + assert "EVIDENCE" in col_names + assert "ONTOLOGY" in col_names + + assert len(res) == 2 + go_ids = res.get_column("GO") + assert "GO:0000001" in go_ids + assert "GO:0000002" in go_ids + +def test_select_many_to_one(mock_orgdb): + res = mock_orgdb.select(keys="AAC2", columns=["ENTREZID"], keytype="ALIAS") + assert len(res) == 1 + assert res.get_column("ENTREZID")[0] == "10" + +def test_mapIds(mock_orgdb): + keys = ["1", "10", "100"] + + res = mock_orgdb.mapIds(keys, column="SYMBOL", keytype="ENTREZID") + assert isinstance(res, dict) + assert res["1"] == "A1BG" + assert res["10"] == "NAT2" + + res_list = mock_orgdb.mapIds(["1"], column="GO", keytype="ENTREZID", multiVals="list") + assert isinstance(res_list["1"], list) + assert len(res_list["1"]) == 2 + assert "GO:0000001" in res_list["1"] + +def test_genes_genomicranges(mock_orgdb): + gr = mock_orgdb.genes() + assert isinstance(gr, GenomicRanges) + assert len(gr) == 2 + + names = list(gr.names) + idx = names.index("1") + assert str(gr.seqnames[idx]) == "chr19" + assert gr.start[idx] == 58346806 + assert gr.end[idx] == 58353492 + + assert "gene_id" in gr.mcols.column_names + assert gr.mcols.get_column("gene_id")[idx] == "1" \ No newline at end of file diff --git a/tests/test_real.py b/tests/test_real.py new file mode 100644 index 0000000..089b069 --- /dev/null +++ b/tests/test_real.py @@ -0,0 +1,26 @@ +from orgdb import OrgDb, OrgDbRegistry + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_real_orgdb_workflow(tmp_path): + registry = OrgDbRegistry(cache_dir=tmp_path / "cache") + orgdb_id = "org.Mm.eg.db" + + assert orgdb_id in registry.list_orgdb() + + orgdb = registry.load_db(orgdb_id, force=True) + assert isinstance(orgdb, OrgDb) + + res = orgdb.select( + keytype="GO", + keys=[ + "GO:0048709", # + "GO:0048699", + "GO:0048143"], 
+ columns="SYMBOL") + + assert res.shape == (104, 4) + orgdb.close() diff --git a/tests/test_registry.py b/tests/test_registry.py new file mode 100644 index 0000000..57ca98f --- /dev/null +++ b/tests/test_registry.py @@ -0,0 +1,22 @@ +import pytest +from orgdb import OrgDbRegistry + +@pytest.fixture +def registry(tmp_path): + """Initialize registry with a temp cache dir.""" + return OrgDbRegistry(cache_dir=tmp_path / "cache") + +def test_registry_init(registry): + assert isinstance(registry, OrgDbRegistry) + assert "org.Hs.eg.db" in registry.list_orgdb() + +def test_get_record(registry): + rec = registry.get_record("org.Hs.eg.db") + assert rec.orgdb_id == "org.Hs.eg.db" + assert rec.species == "Hs" + assert rec.id_type == "eg" + assert rec.url.endswith("org.Hs.eg.db.sqlite") + +def test_invalid_id(registry): + with pytest.raises(KeyError): + registry.get_record("org.Invalid.db") diff --git a/tests/test_skeleton.py b/tests/test_skeleton.py deleted file mode 100644 index f54c55d..0000000 --- a/tests/test_skeleton.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from orgdb.skeleton import fib, main - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -def test_fib(): - """API Tests""" - assert fib(1) == 1 - assert fib(2) == 1 - assert fib(7) == 13 - with pytest.raises(AssertionError): - fib(-10) - - -def test_main(capsys): - """CLI Tests""" - # capsys is a pytest fixture that allows asserts against stdout/stderr - # https://docs.pytest.org/en/stable/capture.html - main(["7"]) - captured = capsys.readouterr() - assert "The 7-th Fibonacci number is 13" in captured.out