diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index ca3ca16..e03113b 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -202,6 +202,10 @@ def _scholar_pub(self, __data, publication: Publication): if title.find('a'): publication['pub_url'] = title.find('a')['href'] + pdf_div = __data.find('div', class_='gs_ggs gs_fl') + if pdf_div and pdf_div.find('a', href=True): + publication['eprint_url'] = pdf_div.find('a')['href'] + author_div_element = databox.find('div', class_='gs_a') authorinfo = author_div_element.text authorinfo = authorinfo.replace(u'\xa0', u' ') # NBSP @@ -286,6 +290,10 @@ def fill(self, publication: Publication)->Publication: if soup.find('a', class_='gsc_oci_title_link'): publication['pub_url'] = soup.find( 'a', class_='gsc_oci_title_link')['href'] + if soup.find('div', class_='gsc_oci_title_ggi'): + link = soup.find('a', attrs={'data-clk': True}) + if link: + publication['eprint_url'] = link['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_oci_field').text.strip().lower() val = item.find(class_='gsc_oci_value') diff --git a/test_module.py b/test_module.py index e54e1aa..bf464e3 100644 --- a/test_module.py +++ b/test_module.py @@ -724,7 +724,7 @@ def test_search_pubs(self): pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"')) # Check that the first entry in pubs is the same as pub. # Checking for quality holds for non-dict entries only. - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}: self.assertEqual(pub[key], pubs[0][key]) for key in {'title', 'pub_year', 'venue'}: self.assertEqual(pub['bib'][key], pubs[0]['bib'][key]) @@ -784,6 +784,7 @@ def test_search_pubs_filling_publication_contents(self): self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology') self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation') self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817') + self.assertTrue(f['eprint_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf') self.assertTrue(f['bib']['volume'] == '18') self.assertTrue(f['bib']['pub_year'] == u'2018') @@ -800,6 +801,7 @@ def test_related_articles_from_author(self): # Typically, the same publication is returned as the most related article same_article = next(related_articles) self.assertEqual(pub["pub_url"], same_article["pub_url"]) + self.assertEqual(pub["eprint_url"], same_article["eprint_url"]) for key in {'title', 'pub_year'}: self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) @@ -818,7 +820,7 @@ def test_related_articles_from_publication(self): related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}: self.assertEqual(pub[key], same_article[key]) for key in {'title', 'pub_year'}: self.assertEqual(pub['bib'][key], same_article['bib'][key])