From 7256ad9e864240ae3651d98c1760ee0d5eb6ff6b Mon Sep 17 00:00:00 2001 From: nitish-iiitd Date: Wed, 10 Oct 2018 01:06:59 +0530 Subject: [PATCH 1/2] Added a Simple Webpage Parser --- SimpleWebpageParser/README.md | 11 +++++++++++ SimpleWebpageParser/SimpleWebpageParser.py | 13 +++++++++++++ SimpleWebpageParser/__init__.py | 0 SimpleWebpageParser/webpage_parser.py | 8 ++++++++ 4 files changed, 32 insertions(+) create mode 100644 SimpleWebpageParser/README.md create mode 100644 SimpleWebpageParser/SimpleWebpageParser.py create mode 100644 SimpleWebpageParser/__init__.py create mode 100644 SimpleWebpageParser/webpage_parser.py diff --git a/SimpleWebpageParser/README.md b/SimpleWebpageParser/README.md new file mode 100644 index 0000000..55d6bc9 --- /dev/null +++ b/SimpleWebpageParser/README.md @@ -0,0 +1,11 @@ +# Simple Webpage Parser +A simple wrapper around the popular web scraper library BeautifulSoap. It merges the use of Requests and BeautifulSoap library in one class which abstracts the extracting of html from web url logic and gives user a clean code to work with. + +## Libraries Required +1. requests +`$pip install requests` +2. beautifulsoup4 +`$pip install beautifulsoup4` + +## Usage +A sample script `webpage_parser.py` has been provided to show the usage of the SimpleWebpageParser. It prints all the links from the Hacktoberfest's home page. 
\ No newline at end of file
diff --git a/SimpleWebpageParser/SimpleWebpageParser.py b/SimpleWebpageParser/SimpleWebpageParser.py
new file mode 100644
index 0000000..30bc38b
--- /dev/null
+++ b/SimpleWebpageParser/SimpleWebpageParser.py
@@ -0,0 +1,13 @@
+import requests
+from bs4 import BeautifulSoup
+
+class SimpleWebpageParser():
+    """Fetch a web page with requests and parse it with BeautifulSoup."""
+    def __init__(self, url, timeout=10):
+        self.url = url
+        self.timeout = timeout  # seconds; without it requests can wait forever
+
+    def getHTML(self):
+        """Fetch self.url and return it parsed as a BeautifulSoup object."""
+        r = requests.get(self.url, timeout=self.timeout)
+        return BeautifulSoup(r.text, "lxml")
\ No newline at end of file
diff --git a/SimpleWebpageParser/__init__.py b/SimpleWebpageParser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/SimpleWebpageParser/webpage_parser.py b/SimpleWebpageParser/webpage_parser.py
new file mode 100644
index 0000000..b0fb40a
--- /dev/null
+++ b/SimpleWebpageParser/webpage_parser.py
@@ -0,0 +1,8 @@
+from SimpleWebpageParser import SimpleWebpageParser
+
+swp = SimpleWebpageParser("https://hacktoberfest.digitalocean.com/")
+html = swp.getHTML()
+print(html.find_all('a'))
+
+## the html returned is an object of type BeautifulSoup, you can parse it using BeautifulSoup syntax
+## refer to its documentation for more functionalities
\ No newline at end of file
From c5f6bf0609769b24bae13fc902b9a1abf85ef323 Mon Sep 17 00:00:00 2001
From: nitish-iiitd
Date: Wed, 10 Oct 2018 01:13:46 +0530
Subject: [PATCH 2/2] Updated README

---
 SimpleWebpageParser/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SimpleWebpageParser/README.md b/SimpleWebpageParser/README.md
index 55d6bc9..13bf4b2 100644
--- a/SimpleWebpageParser/README.md
+++ b/SimpleWebpageParser/README.md
@@ -1,5 +1,5 @@
 # Simple Webpage Parser
-A simple wrapper around the popular web scraper library BeautifulSoap. It merges the use of Requests and BeautifulSoap library in one class which abstracts the extracting of html from web url logic and gives user a clean code to work with.
+A simple wrapper around the popular web scraper library BeautifulSoup. 
It merges the Requests and BeautifulSoup libraries into one class that abstracts away fetching the HTML from a webpage's URL and gives the user clean code to work with. ## Libraries Required 1. requests