Merge pull request #285 from drk1rd/master

Tracking webpages [ISSUE#280]
This commit is contained in:
Bartick Maiti 2022-10-09 23:59:39 +05:30 committed by GitHub
commit ac5446da60
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 0 deletions

View File

@ -0,0 +1,5 @@
Tracking any change in a webpage using Python.
- Enter the full URL, including the scheme (for example, https://example.com).
- The program checks the site periodically, so input an interval in seconds.
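- Watch the console: after every check the program prints a timestamped status line (monitoring started, changes detected, or no changes detected).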

View File

@ -0,0 +1,40 @@
import requests
from bs4 import BeautifulSoup
import difflib
import time
from datetime import datetime
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}

# Seconds to wait for the server before a single fetch is abandoned; without
# a timeout one stalled connection would block the monitor forever.
REQUEST_TIMEOUT = 30


def fetch_visible_text(url):
    """Download *url* and return its text with script/style content removed.

    Args:
        url: Address of the page to fetch, including the scheme.

    Returns:
        The text extracted from the parsed page by ``get_text()`` after every
        ``<script>`` and ``<style>`` element has been dropped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "lxml")
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text()


def render_diff(old_text, new_text):
    """Return a line-by-line ``difflib.Differ`` comparison of two texts.

    Each output line keeps its two-character Differ prefix. Trailing
    whitespace is stripped and lines that are blank after stripping are
    omitted, so unchanged empty lines do not clutter the report.

    Args:
        old_text: The previously seen page text.
        new_text: The newly fetched page text.

    Returns:
        The comparison as a single newline-joined string (empty when both
        texts have no lines).
    """
    delta = difflib.Differ().compare(old_text.splitlines(), new_text.splitlines())
    # Differ's "? " hint lines carry their own trailing newline; rstrip()
    # removes it along with any other trailing whitespace.
    return "\n".join(line.rstrip() for line in delta if line.strip())


def monitor(url, interval):
    """Poll *url* forever and report whenever its visible text changes.

    Args:
        url: Address of the page to watch.
        interval: Number of seconds to sleep between two checks.

    The first successful fetch becomes the baseline. Every later fetch is
    compared against the previous one; on a difference the diff is printed.
    A failed fetch is reported and retried after the next interval instead of
    terminating the monitor.
    """
    previous = None  # None until the first successful fetch sets the baseline.
    while True:
        try:
            current = fetch_visible_text(url)
        except requests.RequestException as err:
            print(f"Could not fetch {url} ({err}) {datetime.now()}")
        else:
            if previous is None:
                print(f"Started Monitoring {url} {datetime.now()}")
            elif current != previous:
                print(f"Changes detected at: {datetime.now()}")
                print(render_diff(previous, current))
            else:
                print(f"No Changes Detected {datetime.now()}")
            previous = current
        time.sleep(interval)


def main():
    """Prompt for the URL and polling interval, then monitor until Ctrl-C."""
    url = input("url: ").strip()
    interval = int(input("interval(s): "))
    try:
        monitor(url, interval)
    except KeyboardInterrupt:
        # Ctrl-C is the only way to stop the loop; exit without a traceback.
        print("\nStopped monitoring.")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,3 @@
bs4==0.0.1
lxml==4.9.1
requests==2.28.1