diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py
index fee483686..12072d0dd 100644
--- a/web_programming/crawl_google_scholar_citation.py
+++ b/web_programming/crawl_google_scholar_citation.py
@@ -6,15 +6,16 @@ using title and year of publication, and volume and pages of journal.
 import requests
 from bs4 import BeautifulSoup
 
+
 def get_citation(base_url: str, params: dict) -> str:
     """
-    Returns the citation number for a publication based on its title, journal, volume, 
+    Returns the citation number for a publication based on its title, journal, volume,
     pages, and year of publication.
-    
+
     Parameters:
     - base_url: The base URL for making requests to Google Scholar.
    - params: A dictionary containing the publication information.
-    
+
     Returns:
     - A string containing the number of citations.
     """
@@ -22,15 +23,16 @@ def get_citation(base_url: str, params: dict) -> str:
     soup = BeautifulSoup(
         requests.get(base_url, params=params, timeout=10).content, "html.parser"
     )
-    
+
     # Find the div element with class 'gs_ri' that contains citation information
     div = soup.find("div", attrs={"class": "gs_ri"})
-    
+
     # Find all links in the div and retrieve the third link (the citation count)
     anchors = div.find("div", attrs={"class": "gs_fl"}).find_all("a")
-    
+
     return anchors[2].get_text()  # Return the text from the third link
 
+
 if __name__ == "__main__":
     # Define parameters for the publication whose citation is to be searched
     params = {
@@ -44,6 +46,6 @@ if __name__ == "__main__":
         "year": 2018,
         "hl": "en",  # Language to be used (English)
     }
-    
+
     # Call the get_citation function with the specified URL and parameters
     print(get_citation("https://scholar.google.com/scholar_lookup", params=params))