From eecaae552bee6f031f9416604f5d1234550c240f Mon Sep 17 00:00:00 2001 From: L Date: Fri, 12 Apr 2024 09:46:49 +1000 Subject: [PATCH 1/3] Add in PDF link in publication fill --- scholarly/publication_parser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index ca3ca16..3040f92 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -286,6 +286,10 @@ def fill(self, publication: Publication)->Publication: if soup.find('a', class_='gsc_oci_title_link'): publication['pub_url'] = soup.find( 'a', class_='gsc_oci_title_link')['href'] + if soup.find('div', class_='gsc_oci_title_ggi'): + link = soup.find('a', attrs={'data-clk': True}) + if link: + publication['pdf_url'] = link['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_oci_field').text.strip().lower() val = item.find(class_='gsc_oci_value') From 63f35925081be9040ead4dc8decc50e30f750cc8 Mon Sep 17 00:00:00 2001 From: L Date: Fri, 12 Apr 2024 10:11:32 +1000 Subject: [PATCH 2/3] Update tests and add in pdf url from search results --- scholarly/publication_parser.py | 4 ++++ test_module.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index 3040f92..cb205d5 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -202,6 +202,10 @@ def _scholar_pub(self, __data, publication: Publication): if title.find('a'): publication['pub_url'] = title.find('a')['href'] + pdf_div = __data.find('div', class_='gs_ggs gs_fl') + if pdf_div and pdf_div.find('a', href=True): + publication['pdf_url'] = pdf_div.find('a')['href'] + author_div_element = databox.find('div', class_='gs_a') authorinfo = author_div_element.text authorinfo = authorinfo.replace(u'\xa0', u' ') # NBSP diff --git a/test_module.py b/test_module.py index e54e1aa..153c790 100644 --- a/test_module.py +++ b/test_module.py @@ -724,7 +724,7 @@ def test_search_pubs(self): pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"')) # Check that the first entry in pubs is the same as pub. # Checking for quality holds for non-dict entries only. - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}: self.assertEqual(pub[key], pubs[0][key]) for key in {'title', 'pub_year', 'venue'}: self.assertEqual(pub['bib'][key], pubs[0]['bib'][key]) @@ -784,6 +784,7 @@ def test_search_pubs_filling_publication_contents(self): self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology') self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation') self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817') + self.assertTrue(f['pdf_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf') self.assertTrue(f['bib']['volume'] == '18') self.assertTrue(f['bib']['pub_year'] == u'2018') @@ -800,6 +801,7 @@ def test_related_articles_from_author(self): # Typically, the same publication is returned as the most related article same_article = next(related_articles) self.assertEqual(pub["pub_url"], same_article["pub_url"]) + self.assertEqual(pub["pdf_url"], same_article["pdf_url"]) for key in {'title', 'pub_year'}: self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) @@ -818,7 +820,7 @@ def test_related_articles_from_publication(self): related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}: self.assertEqual(pub[key], same_article[key]) for key in {'title', 'pub_year'}: self.assertEqual(pub['bib'][key], same_article['bib'][key]) From 35f97d7bae60be438b125f7c3008ded2d0e929c1 Mon Sep 17 00:00:00 2001 From: L Date: Thu, 12 Dec 2024 12:30:11 +1000 Subject: [PATCH 3/3] Renamed "pdf_url" to "eprint_url" --- scholarly/publication_parser.py | 4 ++-- test_module.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index cb205d5..e03113b 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -204,7 +204,7 @@ def _scholar_pub(self, __data, publication: Publication): pdf_div = __data.find('div', class_='gs_ggs gs_fl') if pdf_div and pdf_div.find('a', href=True): - publication['pdf_url'] = pdf_div.find('a')['href'] + publication['eprint_url'] = pdf_div.find('a')['href'] author_div_element = databox.find('div', class_='gs_a') authorinfo = author_div_element.text @@ -293,7 +293,7 @@ def fill(self, publication: Publication)->Publication: if soup.find('div', class_='gsc_oci_title_ggi'): link = soup.find('a', attrs={'data-clk': True}) if link: - publication['pdf_url'] = link['href'] + publication['eprint_url'] = link['href'] for item in soup.find_all('div', class_='gs_scl'): key = item.find(class_='gsc_oci_field').text.strip().lower() val = item.find(class_='gsc_oci_value') diff --git a/test_module.py b/test_module.py index 153c790..bf464e3 100644 --- a/test_module.py +++ b/test_module.py @@ -724,7 +724,7 @@ def test_search_pubs(self): pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"')) # Check that the first entry in pubs is the same as pub. # Checking for quality holds for non-dict entries only. - for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}: self.assertEqual(pub[key], pubs[0][key]) for key in {'title', 'pub_year', 'venue'}: self.assertEqual(pub['bib'][key], pubs[0]['bib'][key]) @@ -784,7 +784,7 @@ def test_search_pubs_filling_publication_contents(self): self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology') self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation') self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817') - self.assertTrue(f['pdf_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf') + self.assertTrue(f['eprint_url'] == u'https://jov.arvojournals.org/arvo/content_public/journal/jov/937491/i1534-7362-18-9-1.pdf') self.assertTrue(f['bib']['volume'] == '18') self.assertTrue(f['bib']['pub_year'] == u'2018') @@ -801,7 +801,7 @@ def test_related_articles_from_author(self): # Typically, the same publication is returned as the most related article same_article = next(related_articles) self.assertEqual(pub["pub_url"], same_article["pub_url"]) - self.assertEqual(pub["pdf_url"], same_article["pdf_url"]) + self.assertEqual(pub["eprint_url"], same_article["eprint_url"]) for key in {'title', 'pub_year'}: self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) @@ -820,7 +820,7 @@ def test_related_articles_from_publication(self): related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'pdf_url', 'num_citations'}: + for key in {'author_id', 'pub_url', 'eprint_url', 'num_citations'}: self.assertEqual(pub[key], same_article[key]) for key in {'title', 'pub_year'}: self.assertEqual(pub['bib'][key], same_article['bib'][key])