mirror of
https://github.com/hastagAB/Awesome-Python-Scripts.git
synced 2024-11-23 20:11:07 +00:00
Added a Simple Webpage Parser
This commit is contained in:
parent
b24df42521
commit
7256ad9e86
11
SimpleWebpageParser/README.md
Normal file
11
SimpleWebpageParser/README.md
Normal file
|
@ -0,0 +1,11 @@
|
|||
# Simple Webpage Parser
|
||||
A simple wrapper around the popular web scraping library BeautifulSoup. It combines the Requests and BeautifulSoup libraries in one class, which hides the logic of fetching HTML from a URL and gives the user clean code to work with.
|
||||
|
||||
## Libraries Required
|
||||
1. requests
|
||||
`$pip install requests`
|
||||
2. beautifulsoup4
|
||||
`$pip install beautifulsoup4`
|
||||
|
||||
## Usage
|
||||
A sample script, `webpage_parser.py`, is provided to show how to use the SimpleWebpageParser. It prints all the links found on the Hacktoberfest home page.
|
13
SimpleWebpageParser/SimpleWebpageParser.py
Normal file
13
SimpleWebpageParser/SimpleWebpageParser.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class SimpleWebpageParser():
    """Fetch a web page and expose it as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. ``None`` waits indefinitely.
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def getHTML(self):
        """Download ``self.url`` and return the parsed page.

        Returns:
            A ``BeautifulSoup`` object built from the response text.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the timeout expires.
        """
        # Imported here because it is only needed for the parser fallback.
        from bs4 import FeatureNotFound

        # Without a timeout, requests blocks forever on a stalled server.
        response = requests.get(self.url, timeout=self.timeout)
        markup = response.text

        try:
            return BeautifulSoup(markup, "lxml")
        except FeatureNotFound:
            # lxml is a separate optional install; the parser bundled with
            # Python keeps the class usable when it is not present.
            return BeautifulSoup(markup, "html.parser")
|
0
SimpleWebpageParser/__init__.py
Normal file
0
SimpleWebpageParser/__init__.py
Normal file
8
SimpleWebpageParser/webpage_parser.py
Normal file
8
SimpleWebpageParser/webpage_parser.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
from SimpleWebpageParser import SimpleWebpageParser
|
||||
|
||||
# Download the Hacktoberfest home page and parse it.
swp = SimpleWebpageParser("https://hacktoberfest.digitalocean.com/")
html = swp.getHTML()

# Call print as a function: the Python 2 statement form is a SyntaxError on
# Python 3, while this single-argument call behaves the same on both.
print(html.find_all('a'))
|
||||
|
||||
## the html returned is an object of type BeautifulSoup; you can parse it using BeautifulSoup syntax
|
||||
## refer to its documentation for more functionalities
|
Loading…
Reference in New Issue
Block a user