From 1ca59d0ee105db368f338c5d0f8cb24396e05398 Mon Sep 17 00:00:00 2001 From: Alfian Ali Murtadlo <115053112+AlfianAliM@users.noreply.github.com> Date: Wed, 2 Oct 2024 08:25:46 +0700 Subject: [PATCH 1/2] Update crawl_google_scholar_citation.py --- .../crawl_google_scholar_citation.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index 5f2ccad5f..fee483686 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -6,20 +6,33 @@ using title and year of publication, and volume and pages of journal. import requests from bs4 import BeautifulSoup - def get_citation(base_url: str, params: dict) -> str: """ - Return the citation number. + Returns the citation number for a publication based on its title, journal, volume, + pages, and year of publication. + + Parameters: + - base_url: The base URL for making requests to Google Scholar. + - params: A dictionary containing the publication information. + + Returns: + - A string containing the number of citations. """ + # Send a GET request to the URL with the specified parameters soup = BeautifulSoup( requests.get(base_url, params=params, timeout=10).content, "html.parser" ) + + # Find the div element with class 'gs_ri' that contains citation information div = soup.find("div", attrs={"class": "gs_ri"}) + + # Find all links in the div and retrieve the third link (the citation count) anchors = div.find("div", attrs={"class": "gs_fl"}).find_all("a") - return anchors[2].get_text() - + + return anchors[2].get_text() # Return the text from the third link if __name__ == "__main__": + # Define parameters for the publication whose citation is to be searched params = { "title": ( "Precisely geometry controlled microsupercapacitors for ultrahigh areal " @@ -29,6 +42,8 @@ if __name__ == "__main__": "volume": 30, "pages": "3979-3990", "year": 2018, - "hl": "en", + "hl": "en", # Language to be used (English) } + + # Call the get_citation function with the specified URL and parameters print(get_citation("https://scholar.google.com/scholar_lookup", params=params)) From 355ae4c6c37dde4dc343d50b99fb1ff2cca505d1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 2 Oct 2024 01:27:58 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/crawl_google_scholar_citation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index fee483686..12072d0dd 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -6,15 +6,16 @@ using title and year of publication, and volume and pages of journal. import requests from bs4 import BeautifulSoup + def get_citation(base_url: str, params: dict) -> str: """ - Returns the citation number for a publication based on its title, journal, volume, + Returns the citation number for a publication based on its title, journal, volume, pages, and year of publication. - + Parameters: - base_url: The base URL for making requests to Google Scholar. - params: A dictionary containing the publication information. - + Returns: - A string containing the number of citations. """ @@ -22,15 +23,16 @@ def get_citation(base_url: str, params: dict) -> str: soup = BeautifulSoup( requests.get(base_url, params=params, timeout=10).content, "html.parser" ) - + # Find the div element with class 'gs_ri' that contains citation information div = soup.find("div", attrs={"class": "gs_ri"}) - + # Find all links in the div and retrieve the third link (the citation count) anchors = div.find("div", attrs={"class": "gs_fl"}).find_all("a") - + return anchors[2].get_text() # Return the text from the third link + if __name__ == "__main__": # Define parameters for the publication whose citation is to be searched params = { @@ -44,6 +46,6 @@ if __name__ == "__main__": "year": 2018, "hl": "en", # Language to be used (English) } - + # Call the get_citation function with the specified URL and parameters print(get_citation("https://scholar.google.com/scholar_lookup", params=params))