diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
index 1545fdbb..809f1fa5 100644
--- a/src/somef/header_analysis.py
+++ b/src/somef/header_analysis.py
@@ -109,7 +109,9 @@ def extract_header_content(text):
df = pd.concat(dfs, ignore_index=True)
# for i, j in zip(header, content):
# df = df.append({'Header': i, 'Content': j, 'ParentHeader': parent_headers[i]}, ignore_index=True)
- df['Content'].replace('', np.nan, inplace=True)
+ # df['Content'].replace('', np.nan, inplace=True)
+ df['Content'] = df['Content'].replace('', np.nan)
+
df.dropna(subset=['Content'], inplace=True)
return df, none_header_content
@@ -221,13 +223,15 @@ def extract_categories(repo_data, repository_metadata: Result):
data.at[i, 'Group'] = data['GroupParent'][i]
data = data.drop(columns=['GroupParent'])
if len(data['Group'].iloc[0]) == 0:
- data['Group'].iloc[0] = ['unknown']
+ # data['Group'].iloc[0] = ['unknown']
+ data.loc[0, 'Group'] = ['unknown']
groups = data.apply(lambda x: pd.Series(x['Group']), axis=1).stack().reset_index(level=1, drop=True)
groups.name = 'Group'
data = data.drop('Group', axis=1).join(groups)
if data['Group'].iloc[0] == 'unknown':
- data['Group'].iloc[0] = np.NaN
+ # data['Group'].iloc[0] = np.NaN
+ data.loc[0, 'Group'] = np.nan
# to json
group = data.loc[(data['Group'] != 'None') & pd.notna(data['Group'])]
diff --git a/src/somef/parser/bower_parser.py b/src/somef/parser/bower_parser.py
index f67666af..14793886 100644
--- a/src/somef/parser/bower_parser.py
+++ b/src/somef/parser/bower_parser.py
@@ -24,7 +24,8 @@ def parse_bower_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "bower.json",
+ # "value": "bower.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/cabal_parser.py b/src/somef/parser/cabal_parser.py
index 9bc62662..74ed7bd2 100644
--- a/src/somef/parser/cabal_parser.py
+++ b/src/somef/parser/cabal_parser.py
@@ -24,7 +24,8 @@ def parse_cabal_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": Path(file_path).name,
+ # "value": Path(file_path).name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/composer_parser.py b/src/somef/parser/composer_parser.py
index 664c9db2..d1b07946 100644
--- a/src/somef/parser/composer_parser.py
+++ b/src/somef/parser/composer_parser.py
@@ -25,7 +25,8 @@ def parse_composer_json(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "composer.json",
+ # "value": "composer.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/description_parser.py b/src/somef/parser/description_parser.py
index b3604340..0da8f8fd 100644
--- a/src/somef/parser/description_parser.py
+++ b/src/somef/parser/description_parser.py
@@ -24,7 +24,8 @@ def parse_description_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "DESCRIPTION",
+ # "value": "DESCRIPTION",
+ "value": source,
"type": constants.URL,
"source": source
},
diff --git a/src/somef/parser/gemspec_parser.py b/src/somef/parser/gemspec_parser.py
index 0df763b5..6714b012 100644
--- a/src/somef/parser/gemspec_parser.py
+++ b/src/somef/parser/gemspec_parser.py
@@ -27,7 +27,8 @@ def parse_gemspec_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": Path(file_path).name,
+ # "value": Path(file_path).name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/package_json_parser.py b/src/somef/parser/package_json_parser.py
index 8247645b..a0884c5e 100644
--- a/src/somef/parser/package_json_parser.py
+++ b/src/somef/parser/package_json_parser.py
@@ -188,7 +188,8 @@ def parse_package_json_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "package.json",
+ # "value": "package.json",
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/parser/pom_xml_parser.py b/src/somef/parser/pom_xml_parser.py
index ddedbdae..dfb9c1cc 100644
--- a/src/somef/parser/pom_xml_parser.py
+++ b/src/somef/parser/pom_xml_parser.py
@@ -233,7 +233,8 @@ def parse_pom_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": "pom.xml",
+ # "value": "pom.xml",
+ "value": source,
"type": constants.URL
},
1,
diff --git a/src/somef/parser/toml_parser.py b/src/somef/parser/toml_parser.py
index 202c0cc9..e87269d0 100644
--- a/src/somef/parser/toml_parser.py
+++ b/src/somef/parser/toml_parser.py
@@ -45,7 +45,8 @@ def parse_toml_file(file_path, metadata_result: Result, source):
metadata_result.add_result(
constants.CAT_HAS_PACKAGE_FILE,
{
- "value": display_name,
+ # "value": display_name,
+ "value": source,
"type": constants.URL,
},
1,
diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
index bfe5dc00..6bfe346a 100644
--- a/src/somef/regular_expressions.py
+++ b/src/somef/regular_expressions.py
@@ -72,7 +72,7 @@ def extract_title_old(unfiltered_text):
# header declared with ====
title = ""
if len(underline_header) != 0:
- title = re.split('.+[=]+[\n]+', unfiltered_text)[0].strip()
+ title = re.split(r'.+[=]+[\n]+', unfiltered_text)[0].strip()
else:
# The first occurrence is assumed to be the title.
title = re.findall(r'# .+', unfiltered_text)[0]
@@ -81,7 +81,7 @@ def extract_title_old(unfiltered_text):
title = title[1:].strip()
# Remove other markup (links, etc.)
if "[!" in title:
- title = re.split('\[\!', title)[0].strip()
+ title = re.split(r'\[\!', title)[0].strip()
return title
@@ -689,6 +689,7 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
# RST
for match in re.findall(constants.REGEXP_READTHEDOCS_RST, readme_text):
+ print(match)
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
@@ -702,15 +703,21 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
urls.add(match)
# HTML
- for match in re.findall(constants.REGEXP_READTHEDOCS_HTML, readme_text):
+    pattern_html = re.compile(constants.REGEXP_READTHEDOCS_HTML, flags=re.VERBOSE | re.DOTALL | re.IGNORECASE)
+ for match in pattern_html.findall(readme_text):
if isinstance(match, tuple):
urls.update([u for u in match if u])
elif match:
urls.add(match)
for url in urls:
+ if "pypi.org/project" in url:
+ category = constants.CAT_PACKAGE_DISTRIBUTION
+ else:
+ category = constants.CAT_DOCUMENTATION
+
repository_metadata.add_result(
- constants.CAT_DOCUMENTATION,
+ category,
{
constants.PROP_TYPE: constants.URL,
constants.PROP_VALUE: url
@@ -722,28 +729,28 @@ def extract_readthedocs_badgeds(readme_text, repository_metadata: Result, source
return repository_metadata
-def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
- """
- Function that takes the text of a readme file and searches if there are package manager badges.
- Parameters
- ----------
- @param readme_text: Text of the readme
- @param repository_metadata: Result with all the findings in the repo
- @param source: source file on top of which the extraction is performed (provenance)
- Returns
- -------
- @returns Result with the package badges found
- """
- package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
- for package in package_manager_badges:
- repository_metadata.add_result(constants.CAT_DOCUMENTATION,
- {
- constants.PROP_TYPE: constants.URL,
- constants.PROP_VALUE: package
- }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
+# def extract_package_manager_badgeds(readme_text, repository_metadata: Result, source) -> Result:
+# """
+# Function that takes the text of a readme file and searches if there are package manager badges.
+# Parameters
+# ----------
+# @param readme_text: Text of the readme
+# @param repository_metadata: Result with all the findings in the repo
+# @param source: source file on top of which the extraction is performed (provenance)
+# Returns
+# -------
+# @returns Result with the package badges found
+# """
+# package_manager_badges = re.findall(constants.REGEXP_READTHEDOCS_BADGES, readme_text, re.DOTALL)
+# for package in package_manager_badges:
+# repository_metadata.add_result(constants.CAT_DOCUMENTATION,
+# {
+# constants.PROP_TYPE: constants.URL,
+# constants.PROP_VALUE: package
+# }, 1, constants.TECHNIQUE_REGULAR_EXPRESSION, source)
- return repository_metadata
+# return repository_metadata
def extract_swh_badges(readme_text, repository_metadata: Result, source) -> Result:
diff --git a/src/somef/test/test_bower_parser.py b/src/somef/test/test_bower_parser.py
index bb51f551..06e1e907 100644
--- a/src/somef/test/test_bower_parser.py
+++ b/src/somef/test/test_bower_parser.py
@@ -16,10 +16,12 @@ def test_parse_bower_json(self):
result = Result()
- metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+ # metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+ metadata_result = parse_bower_json_file(bower_file_path, result, bower_file_path)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ self.assertEqual(package_results[0]["result"]["value"], bower_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
name_results = metadata_result.results.get(constants.CAT_NAME, [])
@@ -70,10 +72,12 @@ def test_parse_2_bower_json(self):
result = Result()
- metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+ # metadata_result = parse_bower_json_file(bower_file_path, result, "https://example.org/bower.json")
+ metadata_result = parse_bower_json_file(bower_file_path, result, bower_file_path)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "bower.json")
+ self.assertEqual(package_results[0]["result"]["value"], bower_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
name_results = metadata_result.results.get(constants.CAT_NAME, [])
diff --git a/src/somef/test/test_cabal_parser.py b/src/somef/test/test_cabal_parser.py
index c6259c99..fc50e079 100644
--- a/src/somef/test/test_cabal_parser.py
+++ b/src/somef/test/test_cabal_parser.py
@@ -15,11 +15,12 @@ def test_parse_cabal(self):
cabal_file_path = test_data_repositories + os.path.sep + "unused" + os.path.sep + "unused.cabal"
result = Result()
- metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/unused.cabal")
-
+ # metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/unused.cabal")
+ metadata_result = parse_cabal_file(cabal_file_path, result, cabal_file_path)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+ # self.assertEqual(package_results[0]["result"]["value"], "unused.cabal")
+ self.assertEqual(package_results[0]["result"]["value"], cabal_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -56,11 +57,12 @@ def test_parse_2_cabal(self):
cabal_file_path = test_data_repositories + os.path.sep + "haskell" + os.path.sep + "cabal.cabal"
result = Result()
- metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/cabal.cabal")
-
+ # metadata_result = parse_cabal_file(cabal_file_path, result, "https://example.org/cabal.cabal")
+ metadata_result = parse_cabal_file(cabal_file_path, result, cabal_file_path)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+ # self.assertEqual(package_results[0]["result"]["value"], "cabal.cabal")
+ self.assertEqual(package_results[0]["result"]["value"], cabal_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
description_results = metadata_result.results.get(constants.CAT_DESCRIPTION, [])
self.assertTrue(len(description_results) > 0, "No description found")
diff --git a/src/somef/test/test_description_parser.py b/src/somef/test/test_description_parser.py
index b613c71c..d9951b59 100644
--- a/src/somef/test/test_description_parser.py
+++ b/src/somef/test/test_description_parser.py
@@ -15,7 +15,9 @@ def test_description(self):
description_file_path = test_data_repositories + os.path.sep + "tidyverse" + os.path.sep + "DESCRIPTION"
result = Result()
- metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+ # metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+
+ metadata_result = parse_description_file(description_file_path, result, description_file_path)
# print(metadata_result.results)
@@ -28,7 +30,9 @@ def test_description(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ # self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ # self.assertEqual(package_results[0]["result"]["value"], "https://example.org/DESCRIPTION")
+ self.assertEqual(package_results[0]["result"]["value"], description_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
@@ -49,17 +53,18 @@ def test_description(self):
def test_description_2(self):
description_file_path = test_data_repositories + os.path.sep + "ggplot2" + os.path.sep + "DESCRIPTION"
+
result = Result()
- metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
-
+ # metadata_result = parse_description_file(description_file_path, result, "https://example.org/DESCRIPTION")
+ metadata_result = parse_description_file(description_file_path, result, description_file_path)
authors_results = metadata_result.results.get(constants.CAT_AUTHORS, [])
self.assertTrue(len(authors_results) == 11, "Expected 11 authors")
self.assertEqual(authors_results[1]["result"]["value"],"Winston Chang","Second author name mismatch")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "DESCRIPTION")
+ self.assertEqual(package_results[0]["result"]["value"], description_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_gemspec_parser.py b/src/somef/test/test_gemspec_parser.py
index afa07545..9b0e736f 100644
--- a/src/somef/test/test_gemspec_parser.py
+++ b/src/somef/test/test_gemspec_parser.py
@@ -15,14 +15,16 @@ def test_parse_gemspec(self):
gemspec_file_path = test_data_repositories + os.path.sep + "bootstrap-datepicker-rails" + os.path.sep + "bootstrap-datepicker-rails.gemspec"
result = Result()
- metadata_result = parse_gemspec_file(gemspec_file_path, result, "https://example.org/bootstrap-datepicker-rails.gemspec")
+ # metadata_result = parse_gemspec_file(gemspec_file_path, result, "https://example.org/bootstrap-datepicker-rails.gemspec")
+ metadata_result = parse_gemspec_file(gemspec_file_path, result, gemspec_file_path)
authors_results = metadata_result.results.get(constants.CAT_AUTHORS, [])
self.assertTrue(len(authors_results) == 2, "Expected two authors")
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+ # self.assertEqual(package_results[0]["result"]["value"], "bootstrap-datepicker-rails.gemspec")
+ self.assertEqual(package_results[0]["result"]["value"], gemspec_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
diff --git a/src/somef/test/test_package_json_parser.py b/src/somef/test/test_package_json_parser.py
index 9e91dd6c..e106b96e 100644
--- a/src/somef/test/test_package_json_parser.py
+++ b/src/somef/test/test_package_json_parser.py
@@ -16,8 +16,8 @@ def test_parse_package_json_file(self):
package_file_path = test_data_path + "package_neors.json"
result = Result()
- metadata_result = parse_package_json_file(package_file_path, result, "http://example.com/package_neors.json")
-
+ # metadata_result = parse_package_json_file(package_file_path, result, "http://example.com/package_neors.json")
+ metadata_result = parse_package_json_file(package_file_path, result, package_file_path)
package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
self.assertTrue(len(package_id_results) > 0, "No package ID found")
self.assertEqual(package_id_results[0]["result"]["value"], "jsonlab")
@@ -58,7 +58,9 @@ def test_parse_package_json_file(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "package.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "package.json")
+ # self.assertEqual(package_results[0]["result"]["value"], "http://example.com/package_neors.json")
+        self.assertEqual(package_results[0]["result"]["value"], package_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
keywords_results = metadata_result.results.get(constants.CAT_KEYWORDS, [])
diff --git a/src/somef/test/test_pom_parser.py b/src/somef/test/test_pom_parser.py
index 6a28ffb2..f16b57d3 100644
--- a/src/somef/test/test_pom_parser.py
+++ b/src/somef/test/test_pom_parser.py
@@ -17,8 +17,8 @@ def test_parse_pom_file(self):
pom_xml_parser.processed_pom = False
pom_file_path = test_data_repositories + os.path.sep + "Widoco" + os.path.sep + "pom.xml"
result = Result()
-
- metadata_result = parse_pom_file(pom_file_path, result, "https://example.org/pom.xml")
+
+ metadata_result = parse_pom_file(pom_file_path, result, pom_file_path)
identifier_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
self.assertTrue(len(identifier_results) > 0, "No identifier found")
@@ -34,7 +34,8 @@ def test_parse_pom_file(self):
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+ # self.assertEqual(package_results[0]["result"]["value"], "pom.xml")
+ self.assertEqual(package_results[0]["result"]["value"], pom_file_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
requirements_results = metadata_result.results.get(constants.CAT_REQUIREMENTS, [])
diff --git a/src/somef/test/test_regular_expressions.py b/src/somef/test/test_regular_expressions.py
index 76f10a7d..ea801af4 100644
--- a/src/somef/test/test_regular_expressions.py
+++ b/src/somef/test/test_regular_expressions.py
@@ -446,6 +446,7 @@ def test_issue_771(self):
def test_issue_860(self):
"""Test designed to check redthedocs links are extracted correctly when multiple links are present """
documentation_values = []
+ package_values = []
for readme_file in ["README-menty.md", "README-uncbiag.md"]:
with open(test_data_path + readme_file, "r") as data_file:
@@ -461,14 +462,27 @@ def test_issue_860(self):
if isinstance(value, str):
documentation_values.append(value)
+ if "package_distribution" in documentation.results:
+ for result in documentation.results["package_distribution"]:
+ value = result.get("result", {}).get("value")
+ if isinstance(value, str):
+ package_values.append(value)
+
expected_doc_urls = {
- "https://pypi.org/project/mentpy",
+ "https://docs.mentpy.com/en/latest/?badge=latest",
"https://icon.readthedocs.io/en/master/"
}
-
+
+ expected_package_urls = {
+ "https://pypi.org/project/mentpy"
+ }
+
assert expected_doc_urls.issubset(set(documentation_values)), (
f"Expected documentation URLs {expected_doc_urls} not found in {documentation_values}"
)
+ assert expected_package_urls.issubset(set(package_values)), (
+ f"Pypy package {expected_package_urls} not foun in package_distribution: {package_values}"
+ )
def test_readme_rst_readthedocs(self):
"""Test designed to check whether rst readmes get stuck in extracting documentation """
diff --git a/src/somef/test/test_toml_parser.py b/src/somef/test/test_toml_parser.py
index b50dd4a6..9924b663 100644
--- a/src/somef/test/test_toml_parser.py
+++ b/src/somef/test/test_toml_parser.py
@@ -56,11 +56,16 @@ def test_parse_cargo_toml(self):
)
result = Result()
- parse_toml_file(self.cargo_toml_path, result, "test")
+ # parse_toml_file(self.cargo_toml_path, result, "test")
+ print("self.cargo_toml_path:", self.cargo_toml_path)
+ # parse_toml_file(self.cargo_toml_path, result, "http://example.com/rustdesk/Cargo.toml")
+ parse_toml_file(self.cargo_toml_path, result, self.cargo_toml_path)
self.assertIn(constants.CAT_HAS_PACKAGE_FILE, result.results)
package_file = result.results[constants.CAT_HAS_PACKAGE_FILE][0]["result"]["value"]
- self.assertEqual(package_file, "Cargo.toml")
+
+ # self.assertEqual(package_file, "Cargo.toml")
+ self.assertEqual(package_file, self.cargo_toml_path)
self.assertIn(constants.CAT_PACKAGE_ID, result.results)
package_id = result.results[constants.CAT_PACKAGE_ID][0]["result"]["value"]
@@ -94,15 +99,22 @@ def test_parse_pluto_project_toml(self):
"""Test parsing Pluto's Project.toml (Julia) file"""
result = Result()
+ # metadata_result = parse_toml_file(
+ # self.pluto_project_path,
+ # result,
+ # "http://example.com/repo1/Project.toml"
+ # )
+
metadata_result = parse_toml_file(
self.pluto_project_path,
result,
- "http://example.com/repo1/Project.toml"
+ self.pluto_project_path
)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ self.assertEqual(package_results[0]["result"]["value"], self.pluto_project_path)
self.assertEqual(package_results[0]["result"]["type"], constants.URL)
self.assertEqual(package_results[0]["technique"], constants.TECHNIQUE_CODE_CONFIG_PARSER)
@@ -160,15 +172,21 @@ def test_parse_flux_project_toml(self):
"""Test parsing Flux's Project.toml (Julia) file"""
result = Result()
+ # metadata_result = parse_toml_file(
+ # self.flux_project_path,
+ # result,
+ # "http://example.com/repo2/Project.toml"
+ # )
metadata_result = parse_toml_file(
self.flux_project_path,
result,
- "http://example.com/repo2/Project.toml"
+ self.flux_project_path
)
package_results = metadata_result.results.get(constants.CAT_HAS_PACKAGE_FILE, [])
self.assertTrue(len(package_results) > 0, "No package file info found")
- self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ # self.assertEqual(package_results[0]["result"]["value"], "Project.toml")
+ self.assertEqual(package_results[0]["result"]["value"], self.flux_project_path)
package_id_results = metadata_result.results.get(constants.CAT_PACKAGE_ID, [])
self.assertTrue(len(package_id_results) > 0, "No package ID found")
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 1775da1a..14ab832e 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -54,12 +54,29 @@
REGEXP_READTHEDOCS_MD = (
r"\(\s*(https?://[^\s\)]+\.readthedocs\.io[^\s\)]*)\s*\)"
)
+
# REGEXP_READTHEDOCS_HTML = (
-# r"]+href=['\"](https?://[^\s\"']+\.readthedocs\.io[^\s\"']*)['\"][^>]*>"
-# r"(?:\s|<[^>]+>)*"
-# r"
]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*"
+# r"]+?href=['\"](https?://[^'\"\s]+?)['\"][^>]*?>"
+# r"(?:(?!)[\s\S])*?"
+# r"
]+?src=['\"]https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)[^'\"\s]*"
# )
-REGEXP_READTHEDOCS_HTML = ( r"]+href=['\"](https?://[^\s\"']+)['\"][^>]*>" r"(?:\s|<[^>]+>)*" r"
]+src=['\"]https?://readthedocs\.org/projects/[^\s/]+/badge/[^\s\"']*" )
+
+REGEXP_READTHEDOCS_HTML = (r"""
+ ]*\bhref=['"](https?://[^'"\s]+)['"]
+ [^>]*>
+ (?:
+ [^<]+
+ |
+ <(?!/a\b)[^>]*>
+ )*
+
]*\bsrc=['"]
+ https?://(?:readthedocs\.org/projects/|img\.shields\.io/pypi/)
+ [^'"\s>]+
+ ['"]
+ """
+)
# For natural language citation
REGEXP_DOI_NATURAL = r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
REGEXP_YEAR_NATURAL = r'\b(19|20)\d{2}\b'
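
A minimal sketch of how the rewritten verbose pattern can be exercised in isolation. Assumptions: the `REGEXP_READTHEDOCS_HTML` definition and compile flags shown in the hunks above, an installed `somef` package, and an invented badge snippet (not taken from any test fixture).

```python
# Sketch only: check the verbose badge regex against a hypothetical README fragment.
import re

from somef.utils import constants

# Invented HTML badge snippet for illustration.
sample = (
    '<a href="https://pypi.org/project/mentpy">'
    '<img src="https://img.shields.io/pypi/v/mentpy"></a>'
)

pattern_html = re.compile(
    constants.REGEXP_READTHEDOCS_HTML,
    flags=re.VERBOSE | re.DOTALL | re.IGNORECASE,
)

# The capturing group returns the anchor's href; with the categorisation added in
# extract_readthedocs_badgeds, a pypi.org/project URL would be filed under
# CAT_PACKAGE_DISTRIBUTION rather than CAT_DOCUMENTATION.
print(pattern_html.findall(sample))  # expected: ['https://pypi.org/project/mentpy']
```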