RECETOX · hechth · Jun 25, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 9, 2025
diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml
@@ -12,21 +12,24 @@ jobs:
       max-parallel: 5
 
     steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python 3.12
-      uses: actions/setup-python@v3
-      with:
-        python-version: 3.12
-    - name: Install dependencies
-      run: |
-        pip install poetry
-        poetry install --with=dev
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        poetry run pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v3
+        with:
+          python-version: '3.12'  # quoted: an unquoted version is a YAML float (3.10 would become 3.1)
+      - name: Install dependencies
+        run: |
+          pip install poetry
+          poetry install --with=dev
+      - name: Lint with ruff
+        run: |
+          poetry run ruff check . --fix --exit-non-zero-on-fix
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with pytest
+        run: |
+          poetry run pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,51 @@
+exclude: '.*\.(csv|msp)$'
+default_stages: [pre-commit]
+
+default_language_version:
+  python: python3.12
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-json
+      - id: check-toml
+      - id: check-xml
+      - id: check-yaml
+      - id: debug-statements
+      - id: check-builtin-literals
+      - id: check-case-conflict
+      - id: check-docstring-first
+      - id: detect-private-key
+
+  # Ruff linter (primary linter)
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.4.4
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]  # pre-commit supplies the file list; no explicit path
+
+  # Flake8 (optional, for CI or legacy)
+  - repo: https://github.com/pycqa/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        name: flake8 (errors)
+        args: [
+          "--count",
+          "--select=E9,F63,F7,F82",
+          "--show-source",
+          "--statistics"
+        ]
+      - id: flake8
+        name: flake8 (warnings)
+        verbose: true  # --exit-zero always passes; without this pre-commit hides the report
+        args: [
+          "--count",
+          "--exit-zero",
+          "--max-complexity=10",
+          "--max-line-length=127",
+          "--statistics"
+        ]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,106 +1,175 @@
 # Changelog
+
 All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.5.0] - 2025-06-09
+
+### Added
+
+- Added `smiles_to_formula` and `inchi_to_formula` conversions to the RDKit converter.
+- Added unit tests for `smiles_to_formula`.
+- Added support for tabular files, which are treated like tsv files.
+
+### Changed
+
+- Updated `matchms` dependency to `^0.30.0` in `pyproject.toml`.
+
+### Fixed
+
+- Minor documentation and test improvements.
 
 ## [0.4.1] - 2025-05-19
+
 ### Changed
-* updated dependencies [#157](https://github.com/RECETOX/MSMetaEnhancer/pull/157)
+
+- updated dependencies [#157](https://github.com/RECETOX/MSMetaEnhancer/pull/157)
 
 ## [0.4.0] - 2024-03-13
+
 ### Changed
-* Update IDSM SPARQL queries to achieve better performance by @galgonek in [#152](https://github.com/RECETOX/MSMetaEnhancer/pull/152)
-* Switched to poetry and added additional converters by @hechth in [#155](https://github.com/RECETOX/MSMetaEnhancer/pull/155)
 
+- Update IDSM SPARQL queries to achieve better performance by @galgonek in [#152](https://github.com/RECETOX/MSMetaEnhancer/pull/152)
+- Switched to poetry and added additional converters by @hechth in [#155](https://github.com/RECETOX/MSMetaEnhancer/pull/155)
 
 ## [0.3.0] - 2023-05-12
+
 ### Added
-* general class Data for input handling [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
-* DataFrame class to read and handle tabular metadata input [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
-* implementation of blocking time in PubChem [#145](https://github.com/RECETOX/MSMetaEnhancer/pull/145)
+
+- general class Data for input handling [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
+- DataFrame class to read and handle tabular metadata input [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
+- implementation of blocking time in PubChem [#145](https://github.com/RECETOX/MSMetaEnhancer/pull/145)
+
 ### Changed
-* Spectra class is an instantiation of Data class [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
-* fix throttling freezing the app [#144](https://github.com/RECETOX/MSMetaEnhancer/pull/144)
+
+- Spectra class is an instantiation of Data class [#141](https://github.com/RECETOX/MSMetaEnhancer/pull/141)
+- fix throttling freezing the app [#144](https://github.com/RECETOX/MSMetaEnhancer/pull/144)
+
 ### Removed
-* retired NLM (ChemIDplus) service [#140](https://github.com/RECETOX/MSMetaEnhancer/pull/140)
+
+- retired NLM (ChemIDplus) service [#140](https://github.com/RECETOX/MSMetaEnhancer/pull/140)
 
 ## [0.2.5] - 2022-10-15
+
 ### Added
-* added Pytest config file `pytest.ini` and set it to automatically detect asynchronous tests [#124](https://github.com/RECETOX/MSMetaEnhancer/pull/124)
+
+- added Pytest config file `pytest.ini` and set it to automatically detect asynchronous tests [#124](https://github.com/RECETOX/MSMetaEnhancer/pull/124)
+
 ### Changed
-* fixed Circuit Breaker implementation to be compatible with Python 3.9 [#124](https://github.com/RECETOX/MSMetaEnhancer/pull/124)
+
+- fixed Circuit Breaker implementation to be compatible with Python 3.9 [#124](https://github.com/RECETOX/MSMetaEnhancer/pull/124)
+
 ### Removed
 
 ## [0.2.4] - 2022-08-30
+
 ### Changed
-* escaping of single quotes in IDSM arguments [#102](https://github.com/RECETOX/MSMetaEnhancer/issues/102)
-* unified environment and packaging management [#115](https://github.com/RECETOX/MSMetaEnhancer/issues/115)
-* apply circuit breaker pattern in WebConverter [#113](https://github.com/RECETOX/MSMetaEnhancer/issues/113)
+
+- escaping of single quotes in IDSM arguments [#102](https://github.com/RECETOX/MSMetaEnhancer/issues/102)
+- unified environment and packaging management [#115](https://github.com/RECETOX/MSMetaEnhancer/issues/115)
+- apply circuit breaker pattern in WebConverter [#113](https://github.com/RECETOX/MSMetaEnhancer/issues/113)
+
 ### Removed
-* removed test case from curator which fails in matchms > 0.14 [#112](https://github.com/RECETOX/MSMetaEnhancer/issues/112)
+
+- removed test case from curator which fails in matchms > 0.14 [#112](https://github.com/RECETOX/MSMetaEnhancer/issues/112)
 
 ## [0.2.3] - 2022-05-12
+
 ### Added
-* KEGG ID conversions support to BridgeDb service [#101](https://github.com/RECETOX/MSMetaEnhancer/issues/101)
+
+- KEGG ID conversions support to BridgeDb service [#101](https://github.com/RECETOX/MSMetaEnhancer/issues/101)
+
 ### Changed
-* double quotes to single quotes in IDSM [#102](https://github.com/RECETOX/MSMetaEnhancer/issues/102)
+
+- double quotes to single quotes in IDSM [#102](https://github.com/RECETOX/MSMetaEnhancer/issues/102)
 
 ## [0.2.2] - 2022-04-27
+
 ### Added
-* introduced `error` level into logging [#95](https://github.com/RECETOX/MSMetaEnhancer/issues/95)
-* logging of unknown errors in Annotator [#90](https://github.com/RECETOX/MSMetaEnhancer/issues/90) 
+
+- introduced `error` level into logging [#95](https://github.com/RECETOX/MSMetaEnhancer/issues/95)
+- logging of unknown errors in Annotator [#90](https://github.com/RECETOX/MSMetaEnhancer/issues/90)
+
 ### Changed
-* the log file is now written continuously during annotation and the metrics added at the end of the file [#92](https://github.com/RECETOX/MSMetaEnhancer/issues/92)
+
+- the log file is now written continuously during annotation and the metrics added at the end of the file [#92](https://github.com/RECETOX/MSMetaEnhancer/issues/92)
+
 ### Removed
 
 ## [0.2.1] - 2022-04-05
+
 ### Added
-* try-finally block to ensure the Monitor thread is always terminated [#86](https://github.com/RECETOX/MSMetaEnhancer/issues/86)
+
+- try-finally block to ensure the Monitor thread is always terminated [#86](https://github.com/RECETOX/MSMetaEnhancer/issues/86)
+
 ### Changed
-* improved parsing of PubChem responses [#84](https://github.com/RECETOX/MSMetaEnhancer/issues/84)
+
+- improved parsing of PubChem responses [#84](https://github.com/RECETOX/MSMetaEnhancer/issues/84)
 
 ## [0.2.0] - 2022-03-19
+
 ### Added
-* BridgeDb supporting conversion of several database IDs [#76](https://github.com/RECETOX/MSMetaEnhancer/issues/76)
-* ComputeConverter class for conversions based on computation instead of querying [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
-* ConverterBuilder which validates and initialises converters [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
-* reintroduced PubChem service using direct REST web interface [#76](https://github.com/RECETOX/MSMetaEnhancer/issues/76)
+
+- BridgeDb supporting conversion of several database IDs [#76](https://github.com/RECETOX/MSMetaEnhancer/issues/76)
+- ComputeConverter class for conversions based on computation instead of querying [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
+- ConverterBuilder which validates and initialises converters [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
+- reintroduced PubChem service using direct REST web interface [#76](https://github.com/RECETOX/MSMetaEnhancer/issues/76)
+
 ### Changed
-* reorganised Converter class to support computation approach [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
-* renamed PubChem service to IDSM to avoid confusion [#73](https://github.com/RECETOX/MSMetaEnhancer/issues/73)
+
+- reorganised Converter class to support computation approach [#75](https://github.com/RECETOX/MSMetaEnhancer/issues/75)
+- renamed PubChem service to IDSM to avoid confusion [#73](https://github.com/RECETOX/MSMetaEnhancer/issues/73)
 
 ## [0.1.3] - 2022-02-15
+
 ### Added
-* multidict package requirement
-* tracking of attributes validation in log [#68](https://github.com/RECETOX/MSMetaEnhancer/issues/68)
-* CIR: Inchi -> SMILES conversion [#66](https://github.com/RECETOX/MSMetaEnhancer/issues/66)
+
+- multidict package requirement
+- tracking of attributes validation in log [#68](https://github.com/RECETOX/MSMetaEnhancer/issues/68)
+- CIR: Inchi -> SMILES conversion [#66](https://github.com/RECETOX/MSMetaEnhancer/issues/66)
+
 ### Changed
-* passed `multidict` instead of `frozendict` to `aiohttp.ClientSession.post` (required by package)
-* take only first result when there are multiple hits in CIR conversions [#69](https://github.com/RECETOX/MSMetaEnhancer/issues/69)
-* support `ISOMERIC_SMILES` and `CANONICAL_SMILES` in PubChem instead of generic `SMILES` [#67](https://github.com/RECETOX/MSMetaEnhancer/issues/67)
+
+- passed `multidict` instead of `frozendict` to `aiohttp.ClientSession.post` (required by package)
+- take only first result when there are multiple hits in CIR conversions [#69](https://github.com/RECETOX/MSMetaEnhancer/issues/69)
+- support `ISOMERIC_SMILES` and `CANONICAL_SMILES` in PubChem instead of generic `SMILES` [#67](https://github.com/RECETOX/MSMetaEnhancer/issues/67)
 
 ## [0.1.2] - 2022-01-06
+
 ### Added
+
 - `generate_options()` function in `Galaxy` submodule to create all possible conversions supported by the tool in a format suitable for the galaxy tool form [#58](https://github.com/RECETOX/MSMetaEnhancer/pull/58)
 - monitoring of services status during annotation process [#56](https://github.com/RECETOX/MSMetaEnhancer/issues/56)
 - validation of obtained metadata [#59](https://github.com/RECETOX/MSMetaEnhancer/issues/59)
+
 ### Changed
+
 - structure and contents of documentation [#51](https://github.com/RECETOX/MSMetaEnhancer/pull/51)
+
 ### Removed
+
 - tests checking contents and consistency of individual services [#54](https://github.com/RECETOX/MSMetaEnhancer/pull/61)
 
 ## [0.1.1] - 2021-12-07
+
 ### Added
+
 - `get_conversion_functions` on the level of `Converter`
+
 ### Changed
+
 - computation of all available jobs in `Application`
+
 ### Removed
+
 - `get_all_conversions` on the level of `Annotator`
 
 ## [0.1.0] - 2021-11-16
+
 ### Added
+
 - Added conda environment files [#35](https://github.com/RECETOX/MSMetaEnhancer/pull/35)
 - Usage of IDSM SPARQL for PubChem service [#25](https://github.com/RECETOX/MSMetaEnhancer/pull/25)
 - Added logging and quantitative progress of annotation process [#22](https://github.com/RECETOX/MSMetaEnhancer/pull/22)

diff --git a/MSMetaEnhancer/app.py b/MSMetaEnhancer/app.py
@@ -8,7 +8,7 @@
 from MSMetaEnhancer.libs.data import Spectra, DataFrame
 from MSMetaEnhancer.libs.utils import logger
 from MSMetaEnhancer.libs.utils.ConverterBuilder import ConverterBuilder
-from MSMetaEnhancer.libs.utils.Errors import UnknownSpectraFormat
+from MSMetaEnhancer.libs.utils.Errors import UnknownFileFormat
 from MSMetaEnhancer.libs.utils.Job import convert_to_jobs
 from MSMetaEnhancer.libs.utils.Monitor import Monitor
 
@@ -27,10 +27,10 @@ def load_data(self, filename, file_format):
         """
         if file_format in ['msp', 'mgf', 'json']:
             self.data = Spectra()
-        elif file_format in ['csv', 'tsv', 'xlsx']:
+        elif file_format in ['csv', 'tsv', 'tabular', 'xlsx']:
             self.data = DataFrame()
         else:
-            raise UnknownSpectraFormat(f'Format {file_format} not supported.')
+            raise UnknownFileFormat(f'Format {file_format} not supported.')
         self.data.load_data(filename, file_format)
 
     def save_data(self, filename, file_format):

diff --git a/MSMetaEnhancer/libs/Curator.py b/MSMetaEnhancer/libs/Curator.py
@@ -1,6 +1,8 @@
 from matchms.filtering.filter_utils.smile_inchi_inchikey_conversions import (
     is_valid_smiles, is_valid_inchi, is_valid_inchikey
 )
+from MSMetaEnhancer.libs.utils.Errors import InvalidAttributeFormat
+
 
 # Example usage
 smiles = "C1=CC=CC=C1"
@@ -10,7 +12,6 @@
 print(is_valid_smiles(smiles))  # True if valid SMILES
 print(is_valid_inchi(inchi))    # True if valid InChI
 print(is_valid_inchikey(inchikey))  # True if valid InChIKey
-from MSMetaEnhancer.libs.utils.Errors import InvalidAttributeFormat
 
 
 class Curator:

diff --git a/MSMetaEnhancer/libs/converters/compute/RDKit.py b/MSMetaEnhancer/libs/converters/compute/RDKit.py
@@ -2,6 +2,7 @@
 from rdkit.Chem.Descriptors import ExactMolWt
 from rdkit.Chem import MolFromSmiles, MolToSmiles
 from rdkit.Chem.inchi import MolFromInchi
+from rdkit.Chem.rdMolDescriptors import CalcMolFormula
 from rdkit.Chem import Atom
 
 
@@ -68,3 +69,31 @@ def formula_to_mw(self, formula):
             multiplier = int(parts[index + 1]) if len(parts) > index + 1 and parts[index + 1].isnumeric() else 1
             mass += atom.GetMass() * multiplier
         return {'mw': mass}
+
+    def smiles_to_formula(self, smiles: str) -> dict:
+        """
+        Compute molecular formula from SMILES.
+
+        :param smiles: given SMILES
+        :return: computed molecular formula
+        """
+        mol = MolFromSmiles(smiles)
+        if mol is None:
+            return {'formula': ''}
+
+        formula = CalcMolFormula(mol)
+
+        return {'formula': formula}
+
+    def inchi_to_formula(self, inchi: str) -> dict:
+        """
+        Compute molecular formula from InChI.
+
+        :param inchi: given InChI
+        :return: computed molecular formula
+        """
+        mol = MolFromInchi(inchi)
+        if mol is None:
+            return {'formula': ''}
+        formula = CalcMolFormula(mol)
+        return {'formula': formula}