From 4df5aadfba39e8945a6c0143dba33c5001d8aef0 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 19:17:13 -0400
Subject: [PATCH 01/32] Update the proxy keys in _get_webdriver routines to
 reflect the changes that happened when we moved to httpx.

Fixes an issue reported in #498.
---
 scholarly/_proxy_generator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 49d5bd5..d296d90 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -365,8 +365,8 @@ def _get_webdriver(self):
 
     def _get_chrome_webdriver(self):
         if self._proxy_works:
             webdriver.DesiredCapabilities.CHROME['proxy'] = {
-                "httpProxy": self._proxies['http'],
-                "sslProxy": self._proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL"
             }
 
@@ -381,8 +381,8 @@ def _get_firefox_webdriver(self):
         if self._proxy_works:
             # Redirect webdriver through proxy
             webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
-                "httpProxy": self._proxies['http'],
-                "sslProxy": self._proxies['https'],
+                "httpProxy": self._proxies['http://'],
+                "sslProxy": self._proxies['https://'],
                 "proxyType": "MANUAL",
             }

From cd260d666dfa2589107709f2c2f06d98caa2957b Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 19:29:15 -0400
Subject: [PATCH 02/32] Stop prepending proxy with http if it is socks

Raised in #498.
---
 scholarly/_proxy_generator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index d296d90..fb6ac4f 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -189,11 +189,11 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
-        if http[:4] != "http":
+        if http[:4] not in ("http", "sock"):
             http = "http://" + http
         if https is None:
             https = http
-        elif https[:5] != "https":
+        elif https[:5] not in ("https", "socks"):
             https = "https://" + https
 
         proxies = {'http://': http, 'https://': https}

From d6d03f2e1e7431790fda309ab7038153766ee6a2 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 22:55:15 -0400
Subject: [PATCH 03/32] Add a pub_date field to the bib dictionary
---
 scholarly/publication_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index fa58cbf..46b52ee 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -313,6 +313,7 @@ def fill(self, publication: Publication)->Publication:
                                 'YYYY/M/D',
                                 'YYYY/MM/D']
                     publication['bib']['pub_year'] = arrow.get(val.text, patterns).year
+                    publication['bib']['pub_date'] = val.text
                 elif key == 'description':
                     # try to find all the gsh_csp if they exist
                     abstract = val.find_all(class_='gsh_csp')

From 9813d3934d243132af396debfad4247e20b9946a Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 23:50:41 -0400
Subject: [PATCH 04/32] Update tags to get public access of publications
---
 scholarly/author_parser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py
index d46038e..9a9df53 100644
--- a/scholarly/author_parser.py
+++ b/scholarly/author_parser.py
@@ -152,14 +152,14 @@ def _fill_public_access(self, soup, author):
         while True:
             rows = soup.find_all('div', 'gsc_mnd_sec_na')
             if rows:
-                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
+                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gsc_mnd_link_font'):
                     author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                     publications[author_pub_id]["public_access"] = False
 
             rows = soup.find_all('div', 'gsc_mnd_sec_avl')
             if rows:
-                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
+                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gsc_mnd_link_font'):
                     author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                     publications[author_pub_id]["public_access"] = True

From 63cd33abe1d0459e91cfbf011b8f3346132cffe4 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 23:51:17 -0400
Subject: [PATCH 05/32] Account for one version of mandate being cached in tests
---
 test_module.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_module.py b/test_module.py
index 0effc0f..3259ddc 100644
--- a/test_module.py
+++ b/test_module.py
@@ -212,15 +212,16 @@ def test_search_author_single_author(self):
                          sum(pub.get('public_access', None) is True for pub in author['publications']))
         self.assertEqual(author['public_access']['not_available'],
                          sum(pub.get('public_access', None) is False for pub in author['publications']))
-        pub = author['publications'][2]
+        pub = author['publications'][1]
         self.assertEqual(pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC')
         self.assertTrue('5738786554683183717' in pub['cites_id'])
         scholarly.fill(pub)
+        self.assertEqual(pub['pub_url'], "https://dl.acm.org/doi/abs/10.1145/3130800.3130815")
         mandate = Mandate(agency="US National Science Foundation", effective_date="2016/1", embargo="12 months",
                           url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf",
                           url_policy_cached="/mandates/nsf-2021-02-13.pdf", grant="BCS-1354029")
-        self.assertIn(mandate, pub['mandates'])
+        self.assertIn(mandate['agency'], [_mandate['agency'] for _mandate in pub['mandates']])
         # Trigger the pprint method, but suppress the output
         with self.suppress_stdout():
             scholarly.pprint(author)

From ecbc2133d9e38e2e468525f243b31726506deb9a Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 23:51:37 -0400
Subject: [PATCH 06/32] Decrease the coauthor count to make the test pass

The decrease is real, verified by manual counting.
---
 test_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_module.py b/test_module.py
index 3259ddc..2bfbe31 100644
--- a/test_module.py
+++ b/test_module.py
@@ -229,7 +229,7 @@ def test_search_author_single_author(self):
         # Check for the complete list of coauthors
         self.assertGreaterEqual(len(author['coauthors']), 20)
         if len(author['coauthors']) > 20:
-            self.assertGreaterEqual(len(author['coauthors']), 36)
+            self.assertGreaterEqual(len(author['coauthors']), 35)
         self.assertTrue('I23YUh8AAAAJ' in [_coauth['scholar_id'] for _coauth in author['coauthors']])

From 2d09680ab66b611796b6246c02b830c62a442e40 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sun, 18 Jun 2023 01:06:52 -0400
Subject: [PATCH 07/32] Add url entry to bibtex

Addresses #499.
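For reviewers, a minimal sketch of the fallback order this change implements (the dictionary keys are the ones scholarly already uses; the sample values are made up):

    pub = {'bib': {'title': 'Example'}, 'pub_url': 'https://example.org/paper'}
    # prefer the direct eprint link; fall back to pub_url, then an empty string
    url = pub.get('eprint_url') or pub.get('pub_url', '')
    assert url == 'https://example.org/paper'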
---
 scholarly/publication_parser.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index 46b52ee..5d7bf37 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -402,6 +402,11 @@ def bibtex(self, publication: Publication) -> str:
             publication = self.fill(publication)
         a = BibDatabase()
         converted_dict = publication['bib']
+        try:
+            url = publication['eprint_url']
+        except KeyError:
+            url = publication.get('pub_url', '')
+        converted_dict['url'] = url
         converted_dict = remap_bib(converted_dict, _BIB_REVERSE_MAPPING)
         str_dict = {key: str(value) for key, value in converted_dict.items()}
         # convert every key of the dictionary to string to be Bibtex compatible

From fe98eb162d6eba0110974e989f115a51f493c5f4 Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sun, 18 Jun 2023 01:08:42 -0400
Subject: [PATCH 08/32] Fix bibtex unittest
---
 test_module.py                  | 25 +++++--------------------
 testdata/test_bibtex_result.txt | 11 +++++++++++
 2 files changed, 16 insertions(+), 20 deletions(-)
 create mode 100644 testdata/test_bibtex_result.txt

diff --git a/test_module.py b/test_module.py
index 2bfbe31..329eef6 100644
--- a/test_module.py
+++ b/test_module.py
@@ -685,33 +685,18 @@ def test_search_pubs_citedby_id(self):
         pubs = [p for p in scholarly.search_citedby(publication_id)]
         self.assertGreaterEqual(len(pubs), 11)
 
-    @unittest.skip(reason="The BiBTeX comparison is not reliable")
     def test_bibtex(self):
         """
         Test that we get the BiBTeX entry correctly
         """
-        expected_result = \
-            ("""@inproceedings{ester1996density,
-    abstract = {Clustering algorithms are attractive for the task of class identification in spatial databases. However, the application to large spatial databases rises the following requirements for clustering algorithms: minimal requirements of domain knowledge to determine the input},
-    author = {Ester, Martin and Kriegel, Hans-Peter and Sander, J{\"o}rg and Xu, Xiaowei and others},
-    booktitle = {kdd},
-    number = {34},
-    pages = {226--231},
-    pub_year = {1996},
-    title = {A density-based algorithm for discovering clusters in large spatial databases with noise.},
-    venue = {kdd},
-    volume = {96}
-}
-
-""")
-        pub = scholarly.search_single_pub("A density-based algorithm for discovering clusters in large "
-                                          "spatial databases with noise", filled=True)
+        with open("testdata/bibtex.txt", "r") as f:
+            expected_result = "".join(f.readlines())
+
+        pub = scholarly.search_single_pub("A distribution-based clustering algorithm for mining in large "
+                                          "spatial databases", filled=True)
         result = scholarly.bibtex(pub)
-        self.assertEqual(result, expected_result.replace("\n    ", "\n"))
+        self.assertEqual(result, expected_result)
 
     def test_search_pubs(self):
         """

diff --git a/testdata/test_bibtex_result.txt b/testdata/test_bibtex_result.txt
new file mode 100644
index 0000000..be925ef
--- /dev/null
+++ b/testdata/test_bibtex_result.txt
@@ -0,0 +1,11 @@
+@inproceedings{xu1998distribution,
+ abstract = {The problem of detecting clusters of points belonging to a spatial point process arises in many applications. In this paper, we introduce the new clustering algorithm DBCLASD (Distribution-Based Clustering of LArge Spatial Databases) to discover clusters of this type. The results of experiments demonstrate that DBCLASD, contrary to partitioning algorithms such as CLARANS (Clustering Large Applications based on RANdomized Search), discovers clusters of arbitrary shape. Furthermore, DBCLASD does not require any input},
+ author = {Xu, Xiaowei and Ester, Martin and Kriegel, H-P and Sander, J{\"o}rg},
+ booktitle = {Proceedings 14th International Conference on Data Engineering},
+ organization = {IEEE},
+ pages = {324--331},
+ pub_year = {1998},
+ title = {A distribution-based clustering algorithm for mining in large spatial databases},
+ url = {https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=933cba585a12e56a8f60511ebeb74b8cb42634b1},
+ venue = {… Conference on Data …}
+}

From 6caabb291214807935cce938f0ef4463a01d3c6c Mon Sep 17 00:00:00 2001
From: arunkannawadi
Date: Sat, 17 Jun 2023 18:35:54 -0400
Subject: [PATCH 09/32] Update CITATION version to 1.7.11

Fixes #501
---
 CITATION.cff | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CITATION.cff b/CITATION.cff
index 01a09dc..3980c50 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -52,4 +52,4 @@ keywords:
   citation-index scholarly-articles citation-analysis scholar googlescholar
 license: Unlicense
-version: 1.5.0
+version: 1.7.11

From 91bada0e02c69710c25429811a7698d44eca72af Mon Sep 17 00:00:00 2001
From: Ji Ma <11808231+ma-ji@users.noreply.github.com>
Date: Tue, 11 Jul 2023 15:09:09 -0500
Subject: [PATCH 10/32] proxy format conflict

resolve conflict between proxy format: HTTPX and Requests
---
 scholarly/_proxy_generator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index fb6ac4f..01672d1 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -136,7 +136,8 @@ def _check_proxy(self, proxies) -> bool:
         :rtype: {bool}
         """
         with requests.Session() as session:
-            session.proxies = proxies
+            # Reformat proxy for requests. Requests and HTTPX use different proxy format.
+            session.proxies = {'http':proxies['http://'], 'https':proxies['https://']}
             try:
                 resp = session.get("http://httpbin.org/ip", timeout=self._TIMEOUT)
                 if resp.status_code == 200:
@@ -189,6 +190,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
+        # Reformat proxy for HTTPX
         if http[:4] not in ("http", "sock"):
             http = "http://" + http
         if https is None:
@@ -521,6 +523,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
             proxies = {'http://': proxy, 'https://': proxy}
             proxy_works = self._check_proxy(proxies)
             if proxy_works:
+                print(proxies)
                 dirty_proxy = (yield proxy)
                 t1 = time.time()
             else:

From 03f063e8a9a97bb16739cfb23cb9ba5b25301d6b Mon Sep 17 00:00:00 2001
From: Ji Ma <11808231+ma-ji@users.noreply.github.com>
Date: Tue, 11 Jul 2023 15:10:41 -0500
Subject: [PATCH 11/32] del debug print
---
 scholarly/_proxy_generator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 01672d1..9967ed2 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -523,7 +523,6 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
             proxies = {'http://': proxy, 'https://': proxy}
             proxy_works = self._check_proxy(proxies)
             if proxy_works:
-                print(proxies)
                 dirty_proxy = (yield proxy)
                 t1 = time.time()
             else:

From c7d4737d03cdb40bfa82d6a6f98a2a65efb6914a Mon Sep 17 00:00:00 2001
From: Melroy van den Berg
Date: Thu, 10 Aug 2023 23:49:50 +0200
Subject: [PATCH 12/32] Update requirements.txt

Package name changed from `fake_useragent` to `fake-useragent`.

Disclaimer: I'm the maintainer.
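Note for downstream users: only the PyPI distribution name changes; the import path keeps the underscore, so no code changes are needed. A quick check (a sketch, not part of the diff):

    # pip install fake-useragent
    from fake_useragent import UserAgent  # module name is still fake_useragent
    print(UserAgent().random)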
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5b14200..a9b1021 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ arrow
 beautifulsoup4
 bibtexparser
 deprecated
-fake_useragent
+fake-useragent
 free-proxy
 httpx
 python-dotenv

From 6ddccd71a9043050351f8cbe5513a82f36fb10e2 Mon Sep 17 00:00:00 2001
From: "David V. Lu"
Date: Tue, 24 Oct 2023 15:12:46 -0400
Subject: [PATCH 13/32] Update citations by year data
---
 test_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_module.py b/test_module.py
index 329eef6..3859df3 100644
--- a/test_module.py
+++ b/test_module.py
@@ -571,7 +571,7 @@ def test_cites_per_year(self):
         """
         author = scholarly.search_author_id('DW_bVcEAAAAJ')
         scholarly.fill(author, sections=['counts'])
-        cpy = {2014: 1, 2015: 2, 2016: 2, 2017: 0, 2018: 2, 2019: 1, 2020: 12, 2021: 21, 2022: 35}
+        cpy = {2014: 1, 2015: 2, 2016: 2, 2017: 0, 2018: 2, 2019: 0, 2020: 11, 2021: 21, 2022: 37, 2023: 27}
         for year, count in cpy.items():
             self.assertEqual(author['cites_per_year'][year], count)

From 9f194525fbfd4d3460137e4d522c8fe8e78cd982 Mon Sep 17 00:00:00 2001
From: "David V. Lu"
Date: Tue, 24 Oct 2023 15:19:32 -0400
Subject: [PATCH 14/32] Results from running codespell
---
 CHANGELOG.md                  |  6 +++---
 README.md                     |  2 +-
 scholarly/_proxy_generator.py | 10 +++++-----
 scholarly/_scholarly.py       |  2 +-
 scholarly/author_parser.py    |  4 ++--
 scholarly/data_types.py       |  4 ++--
 test_module.py                |  4 ++--
 7 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b5932ad..1bfbb0c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@
 ### Bugfixes
 - Fix pprint failures on Windows #413.
 - Thoroughly handle 1000 or more publications that are available (or not) according to public access mandates #414.
-- Fix errors in `download_mandates_csv` that may occassionally occur for agencies without a policy link #413.
+- Fix errors in `download_mandates_csv` that may occasionally occur for agencies without a policy link #413.
 
 ## Changes in v1.6.3
 
@@ -35,7 +35,7 @@
 ### Features
 - Download table of funding agencies as a CSV file with URL to the funding mandates included
-- Downlad top-ranking journals in general, under sub-categories and in different languages as a CSV file
+- Download top-ranking journals in general, under sub-categories and in different languages as a CSV file
 
 ### Bugfixes
 - #392
 
@@ -58,7 +58,7 @@
 ## Changes in v1.5.0
 ### Features
 - Fetch the public access mandates information from a Scholar profile and mark the publications whether or not they satisfy the open-access mandate.
-- Fetch an author's organization identifer from their Scholar profile
+- Fetch an author's organization identifier from their Scholar profile
 - Search for all authors affiliated with an organization
 - Fetch homepage URL from a Scholar profile
 ### Enhancements

diff --git a/README.md b/README.md
index 88ddfb2..d1ac442 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ This means your code that uses an earlier version of `scholarly` is guaranteed t
 
 ## Tests
 
-To check if your installation is succesful, run the tests by executing the `test_module.py` file as:
+To check if your installation is successful, run the tests by executing the `test_module.py` file as:
 
 ```bash
 python3 test_module

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 9967ed2..2d2ec6f 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -109,7 +109,7 @@ def SingleProxy(self, http=None, https=None):
         :param http: http proxy address
         :type http: string
-        :param https: https proxy adress
+        :param https: https proxy address
         :type https: string
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
 
         :Example::
             >>> pg = ProxyGenerator()
             >>> success = pg.SingleProxy(http = , https = )
         """
         self.logger.info("Enabling proxies: http=%s https=%s", http, https)
         proxy_works = self._use_proxy(http=http, https=https)
@@ -162,7 +162,7 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
         """Refreshes the id by using a new Tor node.
 
-        :returns: Whether or not the refresh was succesful
+        :returns: Whether or not the refresh was successful
         :rtype: {bool}
         """
         try:
@@ -434,7 +434,7 @@ def _handle_captcha2(self, url):
             self.logger.info("Google thinks we are DOSing the captcha.")
             raise e
         except (WebDriverException) as e:
-            self.logger.info("Browser seems to be disfunctional - closed by user?")
+            self.logger.info("Browser seems to be dysfunctional - closed by user?")
             raise e
         except Exception as e:
             # TODO: This exception handler should eventually be removed when
@@ -500,7 +500,7 @@ def _close_session(self):
             self.logger.warning("Could not close webdriver cleanly: %s", e)
 
     def _fp_coroutine(self, timeout=1, wait_time=120):
-        """A coroutine to continuosly yield free proxies
+        """A coroutine to continuously yield free proxies
 
         It takes back the proxies that stopped working and marks it as dirty.
         """

diff --git a/scholarly/_scholarly.py b/scholarly/_scholarly.py
index f0162dc..4f64f51 100644
--- a/scholarly/_scholarly.py
+++ b/scholarly/_scholarly.py
@@ -428,7 +428,7 @@ def search_pubs_custom_url(self, url: str)->_SearchScholarIterator:
         parameters in the Advanced Search dialog box and then use the URL here
         to programmatically fetch the results.
 
-        :param url: custom url to seach for the publication
+        :param url: custom url to search for the publication
         :type url: string
         """
         return self.__nav.search_publications(url)

diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py
index 9a9df53..4516b80 100644
--- a/scholarly/author_parser.py
+++ b/scholarly/author_parser.py
@@ -222,7 +222,7 @@ def _get_coauthors_short(self, soup):
     def _get_coauthors_long(self, author):
         """Get the long (>20) list of coauthors.
 
-        This method fetches the complete list of coauthors bu opening a new
+        This method fetches the complete list of coauthors by opening a new
         page filled with the complete coauthor list.
         Note:
 
@@ -283,7 +283,7 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit:
         :type sortby: string
         :param publication_limit: Select the max number of publications you want you want to fill for the author. Defaults to no limit.
         :type publication_limit: int
-        :returns: The filled object if fill was successfull, False otherwise.
+        :returns: The filled object if fill was successful, False otherwise.
         :rtype: Author or bool
 
         :Example::

diff --git a/scholarly/data_types.py b/scholarly/data_types.py
index dade6ba..13a9a38 100644
--- a/scholarly/data_types.py
+++ b/scholarly/data_types.py
@@ -49,7 +49,7 @@ class PublicationSource(str, Enum):
     We also have publications that appear in the "author pages" of Google Scholar.
     These publications are often a set of publications "merged" together.
 
-    The snippet version of these publications conains the title of the publication,
+    The snippet version of these publications contains the title of the publication,
     a subset of the authors, the (sometimes truncated) venue, and the year of the publication
     and the number of papers that cite the publication.
 
@@ -183,7 +183,7 @@ class Publication(TypedDict, total=False):
                        the "citedby_url" will be a comma-separated list of values.
                        It is also used to return the "cluster" of all the different versions of the paper.
                        https://scholar.google.com/scholar?cluster=16766804411681372720&hl=en
-    :param cites_per_year: a dictionay containing the number of citations per year for this Publication
+    :param cites_per_year: a dictionary containing the number of citations per year for this Publication
                            (source: AUTHOR_PUBLICATION_ENTRY)
     :param eprint_url: digital version of the Publication. Usually it is a pdf.
     :param pub_url: url of the website providing the publication

diff --git a/test_module.py b/test_module.py
index 329eef6..1f6b38c 100644
--- a/test_module.py
+++ b/test_module.py
@@ -244,7 +244,7 @@ def test_search_author_multiple_authors(self):
     def test_search_author_id(self):
         """
         Test the search by author ID. Marie Skłodowska-Curie's ID is
-        EmD_lTEAAAAJ and these IDs are permenant
+        EmD_lTEAAAAJ and these IDs are permanent
         """
         author = scholarly.search_author_id('EmD_lTEAAAAJ')
         self.assertEqual(author['name'], u'Marie Skłodowska-Curie')
@@ -254,7 +254,7 @@ def test_search_author_id_filled(self):
         """
         Test the search by author ID. Marie Skłodowska-Curie's ID is
-        EmD_lTEAAAAJ and these IDs are permenant.
+        EmD_lTEAAAAJ and these IDs are permanent.
         As of July 2020, Marie Skłodowska-Curie has 1963 citations
         on Google Scholar and 179 publications
         """

From 7a4da4b090600a112654aea9be07c7d83adb81ce Mon Sep 17 00:00:00 2001
From: Arun Kannawadi
Date: Sun, 5 Nov 2023 12:48:56 -0500
Subject: [PATCH 15/32] Remove 2023 values from test_cites_per_year
---
 test_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_module.py b/test_module.py
index 3859df3..f08f6e3 100644
--- a/test_module.py
+++ b/test_module.py
@@ -571,7 +571,7 @@ def test_cites_per_year(self):
         """
         author = scholarly.search_author_id('DW_bVcEAAAAJ')
         scholarly.fill(author, sections=['counts'])
-        cpy = {2014: 1, 2015: 2, 2016: 2, 2017: 0, 2018: 2, 2019: 0, 2020: 11, 2021: 21, 2022: 37, 2023: 27}
+        cpy = {2014: 1, 2015: 2, 2016: 2, 2017: 0, 2018: 2, 2019: 0, 2020: 11, 2021: 21, 2022: 37}
         for year, count in cpy.items():
             self.assertEqual(author['cites_per_year'][year], count)

From 2af460eefdaa9d365e5e2b4fab11583fd1e6def9 Mon Sep 17 00:00:00 2001
From: Daniel Lebedinsky
Date: Sun, 19 Nov 2023 19:56:03 -0500
Subject: [PATCH 16/32] Fixed test_bibtex unit test, updated CONTRIBUTING.md
---
 .github/CONTRIBUTING.md | 13 +++++++------
 test_module.py          |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 9f80eb1..abace07 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -16,12 +16,13 @@ Additionally, if you are interesting in contributing to the codebase, submit a p
 
 ## How to contribute
 
-1. Create a fork of `scholarly-python-package/scholarly` repository.
-2. If you add a new feature, try to include tests in already existing test cases, or create a new test case if that is not possible.
-3. Make sure the unit tests pass before raising a PR. For all the unit tests to pass, you typically need to setup a premium proxy service such as `ScraperAPI` or `Luminati` (`Bright Data`). If you do not have an account, you may try to use `FreeProxy`. Without a proxy, 6 out of 17 test cases will be skipped.
-4. Check that the documentatation is consistent with the code. Check that the documentation builds successfully.
-5. Submit a PR, with `develop` as your base branch.
-6. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the Github repository. Passing all tests cases is necessary before merging your PR.
+1. Create a fork of `scholarly-python-package/scholarly` repository. Make sure that "Copy the main branch only" is **not** checked off.
+2. After cloning your fork and checking out into the develop branch, run `python setup.py --help-commands` for more info on how to install dependencies and build. You may need to run it with `sudo`.
+3. If you add a new feature, try to include tests in already existing test cases, or create a new test case if that is not possible. For a comprehensive output, run `python -m unittest -v test_module.py`
+4. Make sure the unit tests pass before raising a PR. For all the unit tests to pass, you typically need to setup a premium proxy service such as `ScraperAPI` or `Luminati` (`Bright Data`). By default, `python setup.py install` will get `FreeProxy`. Without a proxy, 6 out of 17 test cases will be skipped.
+5. Check that the documentatation is consistent with the code. Check that the documentation builds successfully.
+6. Submit a PR, with `develop` as your base branch.
+7. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the Github repository. Passing all tests cases is necessary before merging your PR.
 
 ## Build Docs

diff --git a/test_module.py b/test_module.py
index b2a3ac5..a0b5ac7 100644
--- a/test_module.py
+++ b/test_module.py
@@ -690,7 +690,7 @@ def test_bibtex(self):
         Test that we get the BiBTeX entry correctly
         """
 
-        with open("testdata/bibtex.txt", "r") as f:
+        with open("testdata/test_bibtex_result.txt", "r") as f:
             expected_result = "".join(f.readlines())
 
         pub = scholarly.search_single_pub("A distribution-based clustering algorithm for mining in large "

From ba3b8a4fb56d72fcf7ff89208021d679cae12b51 Mon Sep 17 00:00:00 2001
From: Daniel Lebedinsky
Date: Wed, 29 Nov 2023 00:11:55 -0500
Subject: [PATCH 17/32] Added test for FreeProxy
---
 test_module.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test_module.py b/test_module.py
index a0b5ac7..43c8b5a 100644
--- a/test_module.py
+++ b/test_module.py
@@ -78,6 +78,20 @@ def test_tor_launch_own_process(self):
         authors = [a for a in scholarly.search_author(query)]
         self.assertGreaterEqual(len(authors), 1)
 
+class TestFreeProxy(unittest.TestCase):
+    luminati = os.getenv("USERNAME") and os.getenv("PASSWORD") and os.getenv("PORT")
+    scraperAPI = os.getenv('SCRAPER_API_KEY')
+    skipIf = (luminati is not None) or (scraperAPI is not None)
+
+    @unittest.skipIf(skipIf, reason="Other proxy is being used")
+    def test_freeproxy(self):
+        """
+        Test that we can set up FreeProxy successfully
+        """
+        proxy_generator = ProxyGenerator()
+        success = proxy_generator.FreeProxies()
+        self.assertTrue(success)
+        self.assertEqual(proxy_generator.proxy_mode, "FREE_PROXIES")
 
 class TestScholarly(unittest.TestCase):

From 3b5a2e8a9b73028631b21cf9d5c70d8ad16b4121 Mon Sep 17 00:00:00 2001
From: keko24
Date: Mon, 24 Jun 2024 11:16:35 +0200
Subject: [PATCH 18/32] Fixed an issue where search_pubs doesn't find a
 publication when only a single publication exists for the query.

Added a unit test for search_pubs that tests for the previous problem.
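A minimal reproduction of the reported symptom, assuming the query below still matches exactly one publication on Scholar:

    from scholarly import scholarly
    query = "Perception of physical stability and center of mass of 3D objects"
    pubs = list(scholarly.search_pubs(query))
    assert len(pubs) == 1  # previously [], because the single-result row was not parsed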
---
 scholarly/publication_parser.py |  2 +-
 test_module.py                  | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index 5d7bf37..60d4769 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -58,7 +58,7 @@ def _load_url(self, url: str):
         # this is temporary until setup json file
         self._soup = self._nav._get_soup(url)
         self._pos = 0
-        self._rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl') + self._soup.find_all('div', class_='gsc_mpat_ttl')
+        self._rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl') + self._soup.find_all('div', class_='gs_r gs_or gs_scl gs_fmar') + self._soup.find_all('div', class_='gsc_mpat_ttl')
 
     def _get_total_results(self):
         if self._soup.find("div", class_="gs_pda"):

diff --git a/test_module.py b/test_module.py
index b2a3ac5..19c688c 100644
--- a/test_module.py
+++ b/test_module.py
@@ -653,7 +653,7 @@ def test_search_pubs_empty_publication(self):
         """
         Test that searching for an empty publication returns zero results
         """
-        pubs = [p for p in scholarly.search_pubs('')]
+        pubs = [p for p in scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')]
         self.assertIs(len(pubs), 0)
 
     def test_search_pubs_citedby(self):
@@ -718,6 +718,23 @@ def test_search_pubs(self):
         titles = [p['bib']['title'] for p in pubs]
         self.assertIn('Visual perception of the physical stability of asymmetric three-dimensional objects', titles)
 
+    def test_search_pubs_single_pub(self):
+        """
+        As of Jun 24, 2024 there are is only one pub that fits the search term:
+        [Perception of physical stability and center of mass of 3D objects].
+
+        Check that it returns a proper result and the total results for that search term is equal to 1.
+        """
+        pub = scholarly.search_single_pub("Perception of physical stability and center of mass of 3D objects")
+        pubs = list(scholarly.search_pubs("Perception of physical stability and center of mass of 3D objects"))
+        # Check that the first entry in pubs is the same as pub.
+        # Checking for quality holds for non-dict entries only.
+        for key in {'author_id', 'pub_url', 'num_citations'}:
+            self.assertEqual(pub[key], pubs[0][key])
+        for key in {'title', 'pub_year', 'venue'}:
+            self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])
+        self.assertEqual(len(pubs), 1)
+
     def test_search_pubs_total_results(self):
         """
         As of September 16, 2021 there are 32 pubs that fit the search term:

From 0db2befd2a7f1f500a9433010f516d86ebd63e3c Mon Sep 17 00:00:00 2001
From: keko24
Date: Mon, 24 Jun 2024 11:42:31 +0200
Subject: [PATCH 19/32] Fixed total_results returning 0 when only a single
 publication exists.
---
 scholarly/publication_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index 60d4769..5d7e728 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -70,7 +70,7 @@ def _get_total_results(self):
             match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.\s’]+)', string=x.text)
             if match:
                 return int(re.sub(pattern=r'[,\.\s’]',repl='', string=match.group(2)))
-        return 0
+        return len(self._rows)

From 2cd59b3b8a3e5c10f4bb2fee15f5ea9d5363534a Mon Sep 17 00:00:00 2001
From: Andrej <56741017+keko24@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:58:24 +0200
Subject: [PATCH 20/32] Removed the string in search_pubs in
 test_search_empty_publication.
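For context, the companion fix in patch 19 makes the iterator report a sensible total when Scholar omits the "About N results" banner; a sketch of the intended behaviour (the query is hypothetical):

    iterator = scholarly.search_pubs("some single-hit query")
    print(iterator.total_results)  # falls back to len(parsed rows), i.e. 1, instead of 0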
---
 test_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_module.py b/test_module.py
index 19c688c..bcd93e4 100644
--- a/test_module.py
+++ b/test_module.py
@@ -653,7 +653,7 @@ def test_search_pubs_empty_publication(self):
         """
         Test that searching for an empty publication returns zero results
         """
-        pubs = [p for p in scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects')]
+        pubs = [p for p in scholarly.search_pubs('')]
         self.assertIs(len(pubs), 0)

From 0765945fca6a933a508bd80bb87d746bcc620e61 Mon Sep 17 00:00:00 2001
From: Daniel Nisnevich
Date: Sun, 15 Sep 2024 09:28:18 +0300
Subject: [PATCH 21/32] Update publication_parser.py

changed line 61
---
 scholarly/publication_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index fa58cbf..8997d8f 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -58,7 +58,7 @@ def _load_url(self, url: str):
         # this is temporary until setup json file
         self._soup = self._nav._get_soup(url)
         self._pos = 0
-        self._rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl') + self._soup.find_all('div', class_='gsc_mpat_ttl')
+        self._rows = self._soup.select("div.gs_r.gs_or.gs_scl") + self._soup.select("div.gsc_mpat_ttl")
 
     def _get_total_results(self):
         if self._soup.find("div", class_="gs_pda"):

From 0324b9179fc2002132e73bd8ddfa40dfe4d6ab80 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Mon, 28 Oct 2024 15:14:46 -0400
Subject: [PATCH 22/32] Add github action to codespell develop on push and PRs
---
 .github/workflows/codespell.yml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 .github/workflows/codespell.yml

diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 0000000..748abfb
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,25 @@
+# Codespell configuration is within pyproject.toml
+---
+name: Codespell
+
+on:
+  push:
+    branches: [develop]
+  pull_request:
+    branches: [develop]
+
+permissions:
+  contents: read
+
+jobs:
+  codespell:
+    name: Check for spelling errors
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Annotate locations with typos
+        uses: codespell-project/codespell-problem-matcher@v1
+      - name: Codespell
+        uses: codespell-project/actions-codespell@v2

From c8bf96439cad6c0d63f3ea5f6b90b24edb8e95f8 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Mon, 28 Oct 2024 15:14:46 -0400
Subject: [PATCH 23/32] Add rudimentary codespell config
---
 pyproject.toml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 9787c3b..a58d63b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,10 @@
 [build-system]
 requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
+
+[tool.codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = '.git*'
+check-hidden = true
+# ignore-regex = ''
+# ignore-words-list = ''

From 3e5ae3108d924141a8fec56a5df66ceccedeef03 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Mon, 28 Oct 2024 15:16:12 -0400
Subject: [PATCH 24/32] adjust skips
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index a58d63b..52ed30a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,5 +6,5 @@ build-backend = "setuptools.build_meta"
 # Ref: https://github.com/codespell-project/codespell#using-a-config-file
 skip = '.git*'
 check-hidden = true
-# ignore-regex = ''
+ignore-regex = '\b(assertIn|Ewha Womans|citeseerx.ist.psu.edu\S*)\b'
 # ignore-words-list = ''

From 16b5f89426ccdcbf2924cba9b5838d8c3508826c Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Mon, 28 Oct 2024 15:16:20 -0400
Subject: [PATCH 25/32] [DATALAD RUNCMD] run codespell throughout fixing few
 left typos automagically

=== Do not change lines below ===
{
 "chain": [],
 "cmd": "codespell -w",
 "exit": 0,
 "extra_inputs": [],
 "inputs": [],
 "outputs": [],
 "pwd": "."
}
^^^ Do not change lines above ^^^
---
 CODE_OF_CONDUCT.md      | 2 +-
 scholarly/data_types.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index f5b0e27..9d18efa 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -8,7 +8,7 @@ permalink: /coc.html
 We as members, contributors, and leaders pledge to make participation in our
 community a harassment-free experience for everyone, regardless of age, body
 size, visible or invisible disability, ethnicity, sex characteristics, gender
-identity and expression, level of experience, education, socio-economic status,
+identity and expression, level of experience, education, socioeconomic status,
 nationality, personal appearance, race, religion, or sexual identity
 and orientation.

diff --git a/scholarly/data_types.py b/scholarly/data_types.py
index 13a9a38..d57b1ed 100644
--- a/scholarly/data_types.py
+++ b/scholarly/data_types.py
@@ -20,7 +20,7 @@ class PublicationSource(str, Enum):
     "PUBLICATION SEARCH SNIPPET".
 
     This form captures the publication when it appears as a "snippet" in
-    the context of the resuls of a publication search. For example:
+    the context of the results of a publication search. For example:
 
     Publication search:
     https://scholar.google.com/scholar?hl=en&q=adaptive+fraud+detection&btnG=&as_sdt=0%2C33

From a4e6c8d00f877493498b914bcedd19200f087e95 Mon Sep 17 00:00:00 2001
From: nkxxll
Date: Sun, 29 Dec 2024 15:36:32 +0100
Subject: [PATCH 26/32] docs(quickstart): add conda to install option from
 github README
---
 docs/quickstart.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index 73b5787..019a9ee 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -16,6 +16,12 @@ or use ``pip`` to install from github:
 
     pip install git+https://github.com/scholarly-python-package/scholarly.git
 
+or use ``conda`` to install from ``conda-forge``:
+
+.. code:: bash
+
+    conda install -c conda-forge scholarly
+
 or clone the package using git:

From 1b065eed19eba793daf638211e28f339397d31c9 Mon Sep 17 00:00:00 2001
From: brokenjade3000
Date: Sat, 8 Feb 2025 14:27:55 -0700
Subject: [PATCH 27/32] The current httpx doesn't support the `proxies`
 argument: https://github.com/encode/httpx/blob/master/httpx/_client.py#L239
---
 scholarly/_proxy_generator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py
index 2d2ec6f..3b37883 100644
--- a/scholarly/_proxy_generator.py
+++ b/scholarly/_proxy_generator.py
@@ -485,6 +485,10 @@ def _new_session(self, **kwargs):
             # ScraperAPI requests to work.
            # https://www.scraperapi.com/documentation/
             init_kwargs["verify"] = False
+        if 'proxies' in init_kwargs:
+            proxy=init_kwargs['proxies']['https://']
+            del init_kwargs['proxies']
+            init_kwargs['proxy'] = proxy
         self._session = httpx.Client(**init_kwargs)
         self._webdriver = None

From 67dab6fabd35015c3206949baa388012e4c0883f Mon Sep 17 00:00:00 2001
From: Tobias Zimmermann <77075037+tZimmermann98@users.noreply.github.com>
Date: Mon, 3 Feb 2025 11:40:51 +0100
Subject: [PATCH 28/32] Update publication_parser.py for arrow errors,
 fallback to regex year extraction
---
 scholarly/publication_parser.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index df54306..b023510 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -312,7 +312,11 @@ def fill(self, publication: Publication)->Publication:
                                 'YYYY/M/DD',
                                 'YYYY/M/D',
                                 'YYYY/MM/D']
-                    publication['bib']['pub_year'] = arrow.get(val.text, patterns).year
+                    try:
+                        publication['bib']['pub_year'] = arrow.get(val.text, patterns).year
+                    except ValueError:
+                        # fallback to regex year extraction if arrow fails
+                        publication['bib']['pub_year'] = re.search(r'\d{4}', val.text).group()
                     publication['bib']['pub_date'] = val.text
                 elif key == 'description':
                     # try to find all the gsh_csp if they exist

From db060439daab09a35bef78437594ebdf3fe7188d Mon Sep 17 00:00:00 2001
From: Tobias Zimmermann <77075037+tZimmermann98@users.noreply.github.com>
Date: Mon, 3 Feb 2025 11:52:05 +0100
Subject: [PATCH 29/32] fallback to regex year extraction or empty String when
 arrow fails
---
 scholarly/publication_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index b023510..ca3ca16 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -316,7 +316,8 @@ def fill(self, publication: Publication)->Publication:
                         publication['bib']['pub_year'] = arrow.get(val.text, patterns).year
                     except ValueError:
                         # fallback to regex year extraction if arrow fails
-                        publication['bib']['pub_year'] = re.search(r'\d{4}', val.text).group()
+                        match = re.search(r'\d{4}', val.text)
+                        publication['bib']['pub_year'] = match.group() if match else ""
                     publication['bib']['pub_date'] = val.text
                 elif key == 'description':

From eecaae552bee6f031f9416604f5d1234550c240f Mon Sep 17 00:00:00 2001
From: L
Date: Fri, 12 Apr 2024 09:46:49 +1000
Subject: [PATCH 30/32] Add in PDF link in publication fill
---
 scholarly/publication_parser.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index ca3ca16..3040f92 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -286,6 +286,10 @@ def fill(self, publication: Publication)->Publication:
         if soup.find('a', class_='gsc_oci_title_link'):
             publication['pub_url'] = soup.find(
                 'a', class_='gsc_oci_title_link')['href']
+        if soup.find('div', class_='gsc_oci_title_ggi'):
+            link = soup.find('a', attrs={'data-clk': True})
+            if link:
+                publication['pdf_url'] = link['href']
         for item in soup.find_all('div', class_='gs_scl'):
             key = item.find(class_='gsc_oci_field').text.strip().lower()
             val = item.find(class_='gsc_oci_value')

From 63f35925081be9040ead4dc8decc50e30f750cc8 Mon Sep 17 00:00:00 2001
From: L
Date: Fri, 12 Apr 2024 10:11:32 +1000
Subject: [PATCH 31/32] Update tests and add in pdf url from search results
---
 scholarly/publication_parser.py | 4 ++++
 test_module.py                  | 6 ++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index 3040f92..cb205d5 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -202,6 +202,10 @@ def _scholar_pub(self, __data, publication: Publication):
         if title.find('a'):
             publication['pub_url'] = title.find('a')['href']
 
+        pdf_div = __data.find('div', class_='gs_ggs gs_fl')
+        if pdf_div and pdf_div.find('a', href=True):
+            publication['pdf_url'] = pdf_div.find('a')['href']
+
         author_div_element = databox.find('div', class_='gs_a')
         authorinfo = author_div_element.text
         authorinfo = authorinfo.replace(u'\xa0', u' ')  # NBSP

diff --git a/test_module.py b/test_module.py
index e54e1aa..153c790 100644
--- a/test_module.py
+++ b/test_module.py
@@ -724,7 +724,7 @@ def test_search_pubs(self):
         pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"'))
         # Check that the first entry in pubs is the same as pub.
         # Checking for quality holds for non-dict entries only.
-        for key in {'author_id', 'pub_url', 'num_citations'}:
+        for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}:
             self.assertEqual(pub[key], pubs[0][key])
         for key in {'title', 'pub_year', 'venue'}:
             self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])
@@ -784,6 +784,7 @@ def test_search_pubs_filling_publication_contents(self):
         self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology')
         self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation')
         self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817')
+        self.assertTrue(f['pdf_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf')
         self.assertTrue(f['bib']['volume'] == '18')
         self.assertTrue(f['bib']['pub_year'] == u'2018')
 
@@ -800,6 +801,7 @@ def test_related_articles_from_author(self):
         # Typically, the same publication is returned as the most related article
         same_article = next(related_articles)
         self.assertEqual(pub["pub_url"], same_article["pub_url"])
+        self.assertEqual(pub["pdf_url"], same_article["pdf_url"])
         for key in {'title', 'pub_year'}:
             self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key]))
 
@@ -818,7 +820,7 @@ def test_related_articles_from_publication(self):
         related_articles = scholarly.get_related_articles(pub)
         # Typically, the same publication is returned as the most related article
         same_article = next(related_articles)
-        for key in {'author_id', 'pub_url', 'num_citations'}:
+        for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}:
             self.assertEqual(pub[key], same_article[key])
         for key in {'title', 'pub_year'}:
             self.assertEqual(pub['bib'][key], same_article['bib'][key])

From 35f97d7bae60be438b125f7c3008ded2d0e929c1 Mon Sep 17 00:00:00 2001
From: L
Date: Thu, 12 Dec 2024 12:30:11 +1000
Subject: [PATCH 32/32] Renamed "pdf_url" to "eprint_url"
---
 scholarly/publication_parser.py | 4 ++--
 test_module.py                  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py
index cb205d5..e03113b 100644
--- a/scholarly/publication_parser.py
+++ b/scholarly/publication_parser.py
@@ -204,7 +204,7 @@ def _scholar_pub(self, __data, publication: Publication):
         pdf_div = __data.find('div', class_='gs_ggs gs_fl')
         if pdf_div and pdf_div.find('a', href=True):
-            publication['pdf_url'] = pdf_div.find('a')['href']
+            publication['eprint_url'] = pdf_div.find('a')['href']
 
         author_div_element = databox.find('div', class_='gs_a')
         authorinfo = author_div_element.text
@@ -293,7 +293,7 @@ def fill(self, publication: Publication)->Publication:
         if soup.find('div', class_='gsc_oci_title_ggi'):
             link = soup.find('a', attrs={'data-clk': True})
             if link:
-                publication['pdf_url'] = link['href']
+                publication['eprint_url'] = link['href']
         for item in soup.find_all('div', class_='gs_scl'):
             key = item.find(class_='gsc_oci_field').text.strip().lower()
             val = item.find(class_='gsc_oci_value')

diff --git a/test_module.py b/test_module.py
index 153c790..bf464e3 100644
--- a/test_module.py
+++ b/test_module.py
@@ -724,7 +724,7 @@ def test_search_pubs(self):
         pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"'))
         # Check that the first entry in pubs is the same as pub.
         # Checking for quality holds for non-dict entries only.
-        for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}:
+        for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}:
             self.assertEqual(pub[key], pubs[0][key])
         for key in {'title', 'pub_year', 'venue'}:
             self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])
@@ -784,7 +784,7 @@ def test_search_pubs_filling_publication_contents(self):
         self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology')
         self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation')
         self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817')
-        self.assertTrue(f['pdf_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf')
+        self.assertTrue(f['eprint_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf')
         self.assertTrue(f['bib']['volume'] == '18')
         self.assertTrue(f['bib']['pub_year'] == u'2018')
 
@@ -801,7 +801,7 @@ def test_related_articles_from_author(self):
         # Typically, the same publication is returned as the most related article
         same_article = next(related_articles)
         self.assertEqual(pub["pub_url"], same_article["pub_url"])
-        self.assertEqual(pub["pdf_url"], same_article["pdf_url"])
+        self.assertEqual(pub["eprint_url"], same_article["eprint_url"])
         for key in {'title', 'pub_year'}:
             self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key]))
 
@@ -820,7 +820,7 @@ def test_related_articles_from_publication(self):
         related_articles = scholarly.get_related_articles(pub)
         # Typically, the same publication is returned as the most related article
         same_article = next(related_articles)
-        for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}:
+        for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}:
             self.assertEqual(pub[key], same_article[key])
         for key in {'title', 'pub_year'}:
             self.assertEqual(pub['bib'][key], same_article['bib'][key])
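With the rename in place, callers read the direct PDF link from the pre-existing eprint_url key; a minimal usage sketch (query borrowed from the tests above):

    from scholarly import scholarly
    pub = next(scholarly.search_pubs('"naive physics" stability "3d shape"'))
    pdf_link = pub.get('eprint_url')  # was pub['pdf_url'] earlier in this series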