diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
index 1545fdbb..809f1fa5 100644
--- a/src/somef/header_analysis.py
+++ b/src/somef/header_analysis.py
@@ -109,7 +109,9 @@ def extract_header_content(text):
     df = pd.concat(dfs, ignore_index=True)
     # for i, j in zip(header, content):
     #     df = df.append({'Header': i, 'Content': j, 'ParentHeader': parent_headers[i]}, ignore_index=True)
-    df['Content'].replace('', np.nan, inplace=True)
+    # df['Content'].replace('', np.nan, inplace=True)
+    df['Content'] = df['Content'].replace('', np.nan)
+    df.dropna(subset=['Content'], inplace=True)
     return df, none_header_content
@@ -221,13 +223,15 @@ def extract_categories(repo_data, repository_metadata: Result):
             data.at[i, 'Group'] = data['GroupParent'][i]
     data = data.drop(columns=['GroupParent'])
     if len(data['Group'].iloc[0]) == 0:
-        data['Group'].iloc[0] = ['unknown']
+        # data['Group'].iloc[0] = ['unknown']
+        data.loc[0, 'Group'] = ['unknown']
     groups = data.apply(lambda x: pd.Series(x['Group']), axis=1).stack().reset_index(level=1, drop=True)
     groups.name = 'Group'
     data = data.drop('Group', axis=1).join(groups)
     if data['Group'].iloc[0] == 'unknown':
-        data['Group'].iloc[0] = np.NaN
+        # data['Group'].iloc[0] = np.NaN
+        data.loc[0, 'Group'] = np.nan

     # to json
     group = data.loc[(data['Group'] != 'None') & pd.notna(data['Group'])]
diff --git a/src/somef/parser/bower_parser.py b/src/somef/parser/bower_parser.py
index f67666af..14793886 100644
--- a/src/somef/parser/bower_parser.py
+++ b/src/somef/parser/bower_parser.py
@@ -24,7 +24,8 @@ def parse_bower_json_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": "bower.json",
+            # "value": "bower.json",
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/parser/cabal_parser.py b/src/somef/parser/cabal_parser.py
index 9bc62662..74ed7bd2 100644
--- a/src/somef/parser/cabal_parser.py
+++ b/src/somef/parser/cabal_parser.py
@@ -24,7 +24,8 @@ def parse_cabal_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": Path(file_path).name,
+            # "value": Path(file_path).name,
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/parser/composer_parser.py b/src/somef/parser/composer_parser.py
index 664c9db2..d1b07946 100644
--- a/src/somef/parser/composer_parser.py
+++ b/src/somef/parser/composer_parser.py
@@ -25,7 +25,8 @@ def parse_composer_json(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": "composer.json",
+            # "value": "composer.json",
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/parser/description_parser.py b/src/somef/parser/description_parser.py
index b3604340..0da8f8fd 100644
--- a/src/somef/parser/description_parser.py
+++ b/src/somef/parser/description_parser.py
@@ -24,7 +24,8 @@ def parse_description_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": "DESCRIPTION",
+            # "value": "DESCRIPTION",
+            "value": source,
             "type": constants.URL,
             "source": source
         },
diff --git a/src/somef/parser/gemspec_parser.py b/src/somef/parser/gemspec_parser.py
index 0df763b5..6714b012 100644
--- a/src/somef/parser/gemspec_parser.py
+++ b/src/somef/parser/gemspec_parser.py
@@ -27,7 +27,8 @@ def parse_gemspec_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": Path(file_path).name,
+            # "value": Path(file_path).name,
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/parser/package_json_parser.py b/src/somef/parser/package_json_parser.py
index 8247645b..a0884c5e 100644
--- a/src/somef/parser/package_json_parser.py
+++ b/src/somef/parser/package_json_parser.py
@@ -188,7 +188,8 @@ def parse_package_json_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": "package.json",
+            # "value": "package.json",
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/parser/pom_xml_parser.py b/src/somef/parser/pom_xml_parser.py
index ddedbdae..dfb9c1cc 100644
--- a/src/somef/parser/pom_xml_parser.py
+++ b/src/somef/parser/pom_xml_parser.py
@@ -233,7 +233,8 @@ def parse_pom_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": "pom.xml",
+            # "value": "pom.xml",
+            "value": source,
             "type": constants.URL
         },
         1,
diff --git a/src/somef/parser/toml_parser.py b/src/somef/parser/toml_parser.py
index 202c0cc9..e87269d0 100644
--- a/src/somef/parser/toml_parser.py
+++ b/src/somef/parser/toml_parser.py
@@ -45,7 +45,8 @@ def parse_toml_file(file_path, metadata_result: Result, source):
     metadata_result.add_result(
         constants.CAT_HAS_PACKAGE_FILE,
         {
-            "value": display_name,
+            # "value": display_name,
+            "value": source,
             "type": constants.URL,
         },
         1,
diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
index bfe5dc00..6bfe346a 100644
--- a/src/somef/regular_expressions.py
+++ b/src/somef/regular_expressions.py
@@ -72,7 +72,7 @@ def extract_title_old(unfiltered_text):
     # header declared with ====
     title = ""
     if len(underline_header) != 0:
-        title = re.split('.+[=]+[\n]+', unfiltered_text)[0].strip()
+        title = re.split(r'.+[=]+[\n]+', unfiltered_text)[0].strip()
     else:
         # The first occurrence is assumed to be the title.
         title = re.findall(r'# .+', unfiltered_text)[0]
@@ -81,7 +81,7 @@ def extract_title_old(unfiltered_text):
         title = title[1:].strip()
     # Remove other markup (links, etc.)
     if "[!" in title:
-        title = re.split('\[\!', title)[0].strip()
+        title = re.split(r'\[\!', title)[0].strip()
     return title
@@ -689,6 +689,7 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source

     # RST
     for match in re.findall(constants.REGEXP_READTHEDOCS_RST, readme_text):
+        print(match)
         if isinstance(match, tuple):
             urls.update([u for u in match if u])
         elif match:
@@ -702,15 +703,21 @@
             urls.add(match)

     # HTML
-    for match in re.findall(constants.REGEXP_READTHEDOCS_HTML, readme_text):
+    pattern_html = re.compile(constants.REGEXP_READTHEDOCS_HTML, flags=re.VERBOSE | re.DOTALL | re.IGNORECASE)
+    for match in pattern_html.findall(readme_text):
         if isinstance(match, tuple):
             urls.update([u for u in match if u])
         elif match:
             urls.add(match)

     for url in urls:
+        if "pypi.org/project" in url:
+            category = constants.CAT_PACKAGE_DISTRIBUTION
+        else:
+            category = constants.CAT_DOCUMENTATION
+
         repository_metadata.add_result(
-            constants.CAT_DOCUMENTATION,
+            category,
             {
                 constants.PROP_TYPE: constants.URL,
                 constants.PROP_VALUE: url
@@ -722,28 +729,28 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
     return repository_metadata


-def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
-    """
-    Function that takes the text of a readme file and searches if there are package manager badges.
-    Parameters
-    ----------
-    @param readme_text: Text of the readme
-    @param repository_metadata: Result with all the findings in the repo
-    @param source: source file on top of which the extraction is performed (provenance)
-    Returns
-    -------
-    @returns Result with the package badges found
-    """
-    package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
-    for package in package_manager_badges:
-        repository_metadata.add_result(constants.CAT_DOCUMENTATION,
-                                       {
-                                           constants.PROP_TYPE: constants.URL,
-                                           constants.PROP_VALUE: package
-                                       }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
+# def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
+#     """
+#     Function that takes the text of a readme file and searches if there are package manager badges.
+#     Parameters
+#     ----------
+#     @param readme_text: Text of the readme
+#     @param repository_metadata: Result with all the findings in the repo
+#     @param source: source file on top of which the extraction is performed (provenance)
+#     Returns
+#     -------
+#     @returns Result with the package badges found
+#     """
+#     package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
+#     for package in package_manager_badges:
+#         repository_metadata.add_result(constants.CAT_DOCUMENTATION,
+#                                        {
+#                                            constants.PROP_TYPE: constants.URL,
+#                                            constants.PROP_VALUE: package
+#                                        }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)

-    return repository_metadata
+#     return repository_metadata


 def extract_swh_badges(readme_text, repository_metadata: Result, source) -> Result:
diff --git a/src/somef/test/test_bower_parser.py b/src/somef/test/test_bower_parser.py
index bb51f551..06e1e907 100644
--- a/src/somef/test/test_bower_parser.py
+++ b/src/somef/test/test_bower_parser.py
@@ -16,10 +16,12 @@ def test_parse_bower_json(self):
         result = Result()

-        metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+        # metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+        metadata_result = parse_bower_json_file(bower_file_path, result, bower_file_path)

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+        # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+        self.assertEqual(package_results[0]["result"]["value"], bower_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         name_results = metadata_result.results.get(constants.CAT_NAME, [])
@@ -70,10 +72,12 @@ def test_parse_2_bower_json(self):
         result = Result()

-        metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+        # metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+        metadata_result = parse_bower_json_file(bower_file_path, result, bower_file_path)

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+        # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+        self.assertEqual(package_results[0]["result"]["value"], bower_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         name_results = metadata_result.results.get(constants.CAT_NAME, [])
diff --git a/src/somef/test/test_cabal_parser.py b/src/somef/test/test_cabal_parser.py
index c6259c99..fc50e079 100644
--- a/src/somef/test/test_cabal_parser.py
+++ b/src/somef/test/test_cabal_parser.py
@@ -15,11 +15,12 @@ def test_parse_cabal(self):
         cabal_file_path = test_data_repositories + os.path.sep + "unused" + os.path.sep + "unused.cabal"
         result = Result()

-        metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/unused.cabal")
-
+        # metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/unused.cabal")
+        metadata_result = parse_cabal_file(cabal_file_path, result, cabal_file_path)
         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+        # self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+        self.assertEqual(package_results[0]["result"]["value"], cabal_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -56,11 +57,12 @@ def test_parse_2_cabal(self):
         cabal_file_path = test_data_repositories + os.path.sep + "haskell" + os.path.sep + "cabal.cabal"
         result = Result()

-        metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/cabal.cabal")
-
+        # metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/cabal.cabal")
+        metadata_result = parse_cabal_file(cabal_file_path, result, cabal_file_path)
         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+        # self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+        self.assertEqual(package_results[0]["result"]["value"], cabal_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)
         description_results = metadata_result.results.get(constants.CAT_DESCRIPTION, [])
         self.assertTrue(len(description_results) > 0, "No description found")
diff --git a/src/somef/test/test_description_parser.py b/src/somef/test/test_description_parser.py
index b613c71c..d9951b59 100644
--- a/src/somef/test/test_description_parser.py
+++ b/src/somef/test/test_description_parser.py
@@ -15,7 +15,9 @@ def test_description(self):
         description_file_path = test_data_repositories + os.path.sep + "tidyverse" + os.path.sep + "DESCRIPTION"
         result = Result()

-        metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+        # metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+
+        metadata_result = parse_description_file(description_file_path, result, description_file_path)

         # print(metadata_result.results)
@@ -28,7 +30,9 @@ def test_description(self):
         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+        # self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+        # self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
+        self.assertEqual(package_results[0]["result"]["value"], description_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -49,17 +53,18 @@ def test_description(self):
     def test_description_2(self):
         description_file_path = test_data_repositories + os.path.sep + "ggplot2" + os.path.sep + "DESCRIPTION"
+
         result = Result()

-        metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
-
+        # metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+        metadata_result = parse_description_file(description_file_path, result, description_file_path)
         authors_results = metadata_result.results.get(constants.CAT_AUTHORS, [])
         self.assertTrue(len(authors_results) == 11, "Expected 11 authors")
         self.assertEqual(authors_results[1]["result"]["value"],"Winston Chang","Second author name mismatch")

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+        self.assertEqual(package_results[0]["result"]["value"], description_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_gemspec_parser.py b/src/somef/test/test_gemspec_parser.py
index afa07545..9b0e736f 100644
--- a/src/somef/test/test_gemspec_parser.py
+++ b/src/somef/test/test_gemspec_parser.py
@@ -15,14 +15,16 @@ def test_parse_gemspec(self):
         gemspec_file_path = test_data_repositories + os.path.sep + "bootstrap-datepicker-rails" + os.path.sep + "bootstrap-datepicker-rails.gemspec"
         result = Result()

-        metadata_result = parse_gemspec_file(gemspec_file_path, result, "https://example.org/bootstrap-datepicker-rails.gemspec")
+        # metadata_result = parse_gemspec_file(gemspec_file_path, result, "https://example.org/bootstrap-datepicker-rails.gemspec")
+        metadata_result = parse_gemspec_file(gemspec_file_path, result, gemspec_file_path)

         authors_results = metadata_result.results.get(constants.CAT_AUTHORS, [])
         self.assertTrue(len(authors_results) == 2, "Expected two authors")

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+        # self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+        self.assertEqual(package_results[0]["result"]["value"], gemspec_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_package_json_parser.py b/src/somef/test/test_package_json_parser.py
index 9e91dd6c..e106b96e 100644
--- a/src/somef/test/test_package_json_parser.py
+++ b/src/somef/test/test_package_json_parser.py
@@ -16,8 +16,8 @@ def test_parse_package_json_file(self):
         package_file_path = test_data_path + "package_neors.json"
         result = Result()

-        metadata_result = parse_package_json_file(package_file_path, result, "http://example.com/package_neors.json")
-
+        # metadata_result = parse_package_json_file(package_file_path, result, "http://example.com/package_neors.json")
+        metadata_result = parse_package_json_file(package_file_path, result, package_file_path)
         package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
         self.assertTrue(len(package_id_results) > 0, "No package ID found")
         self.assertEqual(package_id_results[0]["result"]["value"], "jsonlab")
@@ -58,7 +58,9 @@ def test_parse_package_json_file(self):
         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "package.json")
+        # self.assertEqual(package_results[0]["result"]["value"], "package.json")
+        # self.assertEqual(package_results[0]["result"]["value"], "http://example.com/package_neors.json")
+        self.assertEqual(package_results[0]["result"]["value"], package_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         keywords_results = metadata_result.results.get(constants.CAT_KEYWORDS, [])
diff --git a/src/somef/test/test_pom_parser.py b/src/somef/test/test_pom_parser.py
index 6a28ffb2..f16b57d3 100644
--- a/src/somef/test/test_pom_parser.py
+++ b/src/somef/test/test_pom_parser.py
@@ -17,8 +17,8 @@ def test_parse_pom_file(self):
         pom_xml_parser.processed_pom = False
         pom_file_path = test_data_repositories + os.path.sep + "Widoco" + os.path.sep + "pom.xml"
         result = Result()
-
-        metadata_result = parse_pom_file(pom_file_path, result, "https://example.org/pom.xml")
+
+        metadata_result = parse_pom_file(pom_file_path, result, pom_file_path)

         identifier_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
         self.assertTrue(len(identifier_results) > 0, "No identifier found")
@@ -34,7 +34,8 @@ def test_parse_pom_file(self):
         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+        # self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+        self.assertEqual(package_results[0]["result"]["value"], pom_file_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)

         requirements_results = metadata_result.results.get(constants.CAT_REQUIREMENTS, [])
diff --git a/src/somef/test/test_regular_expressions.py b/src/somef/test/test_regular_expressions.py
index 76f10a7d..ea801af4 100644
--- a/src/somef/test/test_regular_expressions.py
+++ b/src/somef/test/test_regular_expressions.py
@@ -446,6 +446,7 @@ def test_issue_771(self):
     def test_issue_860(self):
         """Test designed to check redthedocs links are extracted correctly when multiple links are present """
         documentation_values = []
+        package_values = []

         for readme_file in ["README-menty.md", "README-uncbiag.md"]:
             with open(test_data_path + readme_file, "r") as data_file:
@@ -461,14 +462,27 @@ def test_issue_860(self):
                     if isinstance(value, str):
                         documentation_values.append(value)

+            if "package_distribution" in documentation.results:
+                for result in documentation.results["package_distribution"]:
+                    value = result.get("result", {}).get("value")
+                    if isinstance(value, str):
+                        package_values.append(value)
+
         expected_doc_urls = {
-            "https://pypi.org/project/mentpy",
+            "https://docs.mentpy.com/en/latest/?badge=latest",
             "https://icon.readthedocs.io/en/master/"
         }
-
+
+        expected_package_urls = {
+            "https://pypi.org/project/mentpy"
+        }
+
         assert expected_doc_urls.issubset(set(documentation_values)), (
             f"Expected documentation URLs {expected_doc_urls} not found in {documentation_values}"
         )
+        assert expected_package_urls.issubset(set(package_values)), (
+            f"PyPI package {expected_package_urls} not found in package_distribution: {package_values}"
+        )

     def test_readme_rst_readthedocs(self):
         """Test designed to check whether rst readmes get stuck in extracting documentation """
diff --git a/src/somef/test/test_toml_parser.py b/src/somef/test/test_toml_parser.py
index b50dd4a6..9924b663 100644
--- a/src/somef/test/test_toml_parser.py
+++ b/src/somef/test/test_toml_parser.py
@@ -56,11 +56,16 @@ def test_parse_cargo_toml(self):
         )
         result = Result()

-        parse_toml_file(self.cargo_toml_path, result, "test")
+        # parse_toml_file(self.cargo_toml_path, result, "test")
+        print("self.cargo_toml_path:", self.cargo_toml_path)
+        # parse_toml_file(self.cargo_toml_path, result, "http://example.com/rustdesk/Cargo.toml")
+        parse_toml_file(self.cargo_toml_path, result, self.cargo_toml_path)

         self.assertIn(constants.CAT_HAS_PACKAGE_FILE, result.results)
         package_file = result.results[constants.CAT_HAS_PACKAGE_FILE][0]["result"]["value"]
-        self.assertEqual(package_file, "Cargo.toml")
+
+        # self.assertEqual(package_file, "Cargo.toml")
+        self.assertEqual(package_file, self.cargo_toml_path)

         self.assertIn(constants.CAT_PACKAGE_ID, result.results)
         package_id = result.results[constants.CAT_PACKAGE_ID][0]["result"]["value"]
@@ -94,15 +99,22 @@ def test_parse_pluto_project_toml(self):
         """Test parsing Pluto's Project.toml (Julia) file"""
         result = Result()

+        # metadata_result = parse_toml_file(
+        #     self.pluto_project_path,
+        #     result,
+        #     "http://example.com/repo1/Project.toml"
+        # )
+
         metadata_result = parse_toml_file(
             self.pluto_project_path,
             result,
-            "http://example.com/repo1/Project.toml"
+            self.pluto_project_path
         )

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+        # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+        self.assertEqual(package_results[0]["result"]["value"], self.pluto_project_path)
         self.assertEqual(package_results[0]["result"]["type"], constants.URL)
         self.assertEqual(package_results[0]["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)
@@ -160,15 +172,21 @@ def test_parse_flux_project_toml(self):
         """Test parsing Flux's Project.toml (Julia) file"""
         result = Result()

+        # metadata_result = parse_toml_file(
+        #     self.flux_project_path,
+        #     result,
+        #     "http://example.com/repo2/Project.toml"
+        # )
         metadata_result = parse_toml_file(
             self.flux_project_path,
             result,
-            "http://example.com/repo2/Project.toml"
+            self.flux_project_path
         )

         package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
         self.assertTrue(len(package_results) > 0, "No package file info found")
-        self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+        # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+        self.assertEqual(package_results[0]["result"]["value"], self.flux_project_path)

         package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
         self.assertTrue(len(package_id_results) > 0, "No package ID found")
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 1775da1a..14ab832e 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -54,12 +54,29 @@
 REGEXP_READTHEDOCS_MD = (
     r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)"
 )
+
 # REGEXP_READTHEDOCS_HTML = (
-#     r"<a[^>]+href=['\"](https?://[^\s\"']+\.readthedocs\.io[^\s\"']*)['\"][^>]*>"
-#     r"(?:\s|<[^>]+>)*"
-#     r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*"
+#     r"<a[^>]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>"
+#     r"(?:(?!</a>)[\s\S])*?"
+#     r"<img[^>]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*"
 # )
-REGEXP_READTHEDOCS_HTML = ( r"<a[^>]+href=['\"](https?://[^\s\"']+)['\"][^>]*>" r"(?:\s|<[^>]+>)*" r"<img[^>]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*" )
+
+REGEXP_READTHEDOCS_HTML = (r"""
+    <a[^>]*\bhref=['"](https?://[^'"\s]+)['"]
+    [^>]*>
+    (?:
+        [^<]+
+        |
+        <(?!/a\b)[^>]*>
+    )*
+    <img[^>]*\bsrc=['"]
+    https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)
+    [^'"\s>]+
+    ['"]
+    """
+)

 # For natural language citation
 REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
 REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b'
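
Below is a small standalone sketch (not part of the patch) of how the reworked REGEXP_READTHEDOCS_HTML and the new pypi.org routing in extract_readthedocs_badgeds are expected to behave. The pattern is inlined here so the snippet is self-contained, the compile flags mirror the ones used in the patched extraction code, and the sample README HTML plus the plain-string category names ("documentation", "package_distribution") are illustrative assumptions, not code from the repository.

import re

# Inlined copy of the reworked REGEXP_READTHEDOCS_HTML from src/somef/utils/constants.py
READTHEDOCS_HTML = r"""
    <a[^>]*\bhref=['"](https?://[^'"\s]+)['"]
    [^>]*>
    (?:
        [^<]+
        |
        <(?!/a\b)[^>]*>
    )*
    <img[^>]*\bsrc=['"]
    https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)
    [^'"\s>]+
    ['"]
"""

# Same flags as the re.compile call added in extract_readthedocs_badgeds
pattern_html = re.compile(READTHEDOCS_HTML, flags=re.VERBOSE | re.DOTALL | re.IGNORECASE)

# Invented README fragment: one readthedocs badge link and one shields.io PyPI badge link
readme_html = (
    '<a href="https://docs.mentpy.com/en/latest/?badge=latest">'
    '<img src="https://readthedocs.org/projects/mentpy/badge/?version=latest"></a>\n'
    '<a href="https://pypi.org/project/mentpy">'
    '<img src="https://img.shields.io/pypi/v/mentpy"></a>'
)

for url in pattern_html.findall(readme_html):
    # Mirrors the routing added in the patch: pypi.org project pages go to
    # package_distribution, every other badged link stays under documentation.
    category = "package_distribution" if "pypi.org/project" in url else "documentation"
    print(url, "->", category)

# Expected output:
# https://docs.mentpy.com/en/latest/?badge=latest -> documentation
# https://pypi.org/project/mentpy -> package_distribution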