mirror of https://github.com/hastagAB/Awesome-Python-Scripts.git
synced 2024-11-27 14:01:09 +00:00
Add files via upload (#120)
This commit is contained in:
parent
ded3ba322d
commit
f016839ad0
21 Tweets_Tool/LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Sarah Floris

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
71 Tweets_Tool/README.md Normal file

@@ -0,0 +1,71 @@
# Tweets_Tool

The official Twitter API limits how many tweets you can search for at a time and how far back a search can go (7 days/10 days). If I wanted to collect some historical tweets matching certain criteria, I would have to either buy the Enterprise API or get GNIP, both of which cost money.

## Prerequisites

Since Python 2.x will be deprecated, this package assumes Python 3.x.

Expected package dependencies are listed in the "requirements.txt" file for pip; run the following command to install them:

    pip install -r requirements.txt

## Components

**Tweet**: Model class that gives information about a specific tweet.

* id (str)
* permalink (str)
* username (str)
* text (str)
* date (date)
* retweets (int)
* favorites (int)
* mentions (str)
* hashtags (str)
* geo (str)

**TweetManager:** A manager class that retrieves tweets in Tweet's model

* getTweets: Returns the list of tweets retrieved by using an instance of TweetCriteria
* getJsonReponse: Actually obtains the tweets and returns an object that can be read

**TweetCriteria:** A collection of search parameters to be used together with TweetManager

* setUsername (str): An optional specific username from a Twitter account, without the "@"
* setSince (str, "yyyy-mm-dd"): A lower-bound date to restrict the search
* setUntil (str, "yyyy-mm-dd"): An upper-bound date to restrict the search
* setQuerySearch (str): A query text to be matched
* setTopTweets (bool): If True, only the Top Tweets will be retrieved
* setNear (str): A reference location area from which tweets were generated
* setWithin (str): A distance radius from the "near" location (e.g. 15mi)
* setMaxTweets (int): The maximum number of tweets to be retrieved. If this number is unset or lower than 1, all possible tweets will be retrieved.

**TweetObtain:** Returns a clean dataframe for analysis using TweetCriteria and TweetManager

* TweetObtain_function: Builds that dataframe for a single (name, start_date, end_date) row; see the sketch after the examples below

## Simple examples of Python usage

* Get tweets by username

``` python
tweetCriteria = Tool.TweetCriteria().setUsername('barackobama').setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
|
||||
* Get tweets by query search
|
||||
``` python
|
||||
tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1)
|
||||
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
|
||||
tweets = pd.read_csv('tweets.csv')
|
||||
print(tweets)
|
||||
```

* Get tweets by username and bound dates

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
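
* Get tweets near a location (a sketch, not one of the original examples; it relies on the `setNear`/`setWithin` parameters documented above, and the location string is a made-up placeholder)

``` python
tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setNear("Berlin, Germany").setWithin("15mi").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
print(tweet.text, tweet.geo)
```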

* Get the last 10 top tweets by username

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setTopTweets(True).setMaxTweets(10)
# first one
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
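
* Stream tweets in batches (a sketch, not one of the original examples; `receiveBuffer` and `bufferLength` are the callback parameters of `getTweets` in Tool.py)

``` python
def printBatch(batch):
    # called by getTweets with every bufferLength tweets collected
    print(len(batch), 'tweets received')

tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setMaxTweets(200)
tweets = Tool.TweetManager.getTweets(tweetCriteria, receiveBuffer=printBatch, bufferLength=100)
```

* Build a dataframe with TweetObtain (a sketch; the game name and dates are made-up placeholders, and the row layout matches what main.py reads from the input CSV)

``` python
# one (name, start_date, end_date) row, as in example.csv
df = Tool.TweetObtain().TweetObtain_function(['Bayonetta 2', '2014-10-01', '2014-10-31'])
print(df.head())
```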
224 Tweets_Tool/Tool.py Normal file

@@ -0,0 +1,224 @@
try:
    import http.cookiejar as cookielib
except ImportError:
    import cookielib

# urllib.request and urllib.parse are used below, so import the submodules explicitly
import urllib.request, urllib.parse, json, re, datetime, sys
import pandas as pd
import numpy as np
from pyquery import PyQuery

class TweetCriteria:
    def __init__(self):
        self.maxTweets = 0

    def setUsername(self, username):
        self.username = username
        return self

    def setSince(self, since):
        self.since = since
        return self

    def setUntil(self, until):
        self.until = until
        return self

    def setQuerySearch(self, querySearch):
        self.querySearch = querySearch
        return self

    def setMaxTweets(self, maxTweets):
        self.maxTweets = maxTweets
        return self

    def setLang(self, Lang):
        self.lang = Lang
        return self

    def setTopTweets(self, topTweets):
        self.topTweets = topTweets
        return self

    # These two setters are documented in the README and their attributes
    # are read by getJsonReponse, so they are defined here as well
    def setNear(self, near):
        self.near = near
        return self

    def setWithin(self, within):
        self.within = within
        return self

class Tweet:
    def __init__(self):
        pass


class TweetManager:
    def __init__(self):
        pass

    @staticmethod
    def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None):
        """Return the list of tweets retrieved by using an instance of TweetCriteria"""
        refreshCursor = ''
        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        # Strip quotes wrapped around the username, if any
        if hasattr(tweetCriteria, 'username') and \
                (tweetCriteria.username.startswith("'") or tweetCriteria.username.startswith('"')) and \
                (tweetCriteria.username.endswith("'") or tweetCriteria.username.endswith('"')):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        while active:
            # named dataJson to avoid shadowing the json module
            dataJson = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                                   cookieJar, proxy)
            if len(dataJson['items_html'].strip()) == 0:
                break

            refreshCursor = dataJson['min_position']
            scrapedTweets = PyQuery(dataJson['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = Tweet()

                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(r"\s+", " ",
                             tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
                retweets = int(
                    tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount")
                    .attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount")
                    .attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')

                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo

                results.append(tweet)
                resultsAux.append(tweet)

                # Flush full buffers to the callback, if one was given
                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        # Flush any remaining buffered tweets
        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results

    @staticmethod
    def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
        """Actually obtains the tweets and returns an object that can be read"""
        url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"

        urlGetData = ''

        if hasattr(tweetCriteria, 'username'):
            urlGetData += ' from:' + tweetCriteria.username

        if hasattr(tweetCriteria, 'querySearch'):
            urlGetData += ' ' + tweetCriteria.querySearch

        if hasattr(tweetCriteria, 'near'):
            urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within

        if hasattr(tweetCriteria, 'since'):
            urlGetData += ' since:' + tweetCriteria.since

        if hasattr(tweetCriteria, 'until'):
            urlGetData += ' until:' + tweetCriteria.until

        if hasattr(tweetCriteria, 'topTweets'):
            if tweetCriteria.topTweets:
                url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"
        url = url % (urllib.parse.quote(urlGetData), refreshCursor)

        headers = [('Host', "twitter.com"),
                   ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
                   ('Accept', "application/json, text/javascript, */*; q=0.01"),
                   ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
                   ('X-Requested-With', "XMLHttpRequest"),
                   ('Referer', url),
                   ('Connection', "keep-alive")]

        if proxy:
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({'http': proxy, 'https': proxy}),
                urllib.request.HTTPCookieProcessor(cookieJar))
        else:
            opener = urllib.request.build_opener(
                urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = headers

        try:
            response = opener.open(url)
            jsonResponse = response.read()
        except Exception:
            print("Twitter weird response. Try to see on browser: "
                  "https://twitter.com/search?q=%s&src=typd" % urllib.parse.quote(urlGetData))
            sys.exit()
        dataJson = json.loads(jsonResponse)

        return dataJson

class TweetObtain:
    def __init__(self):
        pass

    def TweetObtain_function(self, videogame):
        """Returns a clean dataframe for analysis using TweetCriteria and TweetManager"""
        print(videogame)
        tweet_date = []
        tweet_text = []
        tweetCriteria = TweetCriteria().setQuerySearch(videogame[0]). \
            setSince(videogame[1]).setUntil(videogame[2]).setMaxTweets(1000)
        tweets = TweetManager().getTweets(tweetCriteria)
        for tweet in tweets:
            tweet_date.append(tweet.date)
            tweet_text.append(tweet.text)
        df = pd.DataFrame(np.column_stack((tweet_date, tweet_text)))
        df['name'] = videogame[0]
        df['start_date'] = videogame[1]
        df['end_date'] = videogame[2]
        return df
2 Tweets_Tool/example.csv Normal file

@@ -0,0 +1,2 @@
0,1,2,3
0,Bayonetta 2,18 Oct 85 ,18 Oct 85
24 Tweets_Tool/main.py Normal file

@@ -0,0 +1,24 @@
import sys
assert sys.version_info >= (3, 0)
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import Tool


def main(filepath):
    assert isinstance(filepath, str)
    videogames = pd.read_csv(filepath, skiprows=1,
                             names=['games', 'start_date', 'end_date'])
    videogames = videogames.values
    # one worker thread per row keeps the scraping concurrent
    pool = ThreadPool(250)
    text_results = pool.map(Tool.TweetObtain().TweetObtain_function, videogames)
    pool.close()
    pool.join()
    text_results = pd.DataFrame(np.vstack(text_results))
    text_results.to_csv('tweets.csv')
    return


if __name__ == "__main__":
    main(sys.argv[1])
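
A minimal invocation sketch (assuming a CSV laid out like the bundled example.csv, one name/start date/end date row per query):

``` python
# equivalent to running: python main.py example.csv
import main
main.main('example.csv')  # scrapes each row's query and writes tweets.csv
```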
5 Tweets_Tool/requirements.txt Normal file

@@ -0,0 +1,5 @@
requests==2.18.4
urllib3==1.22
py4j==0.10.4
BeautifulSoup==3.2.0
numpy==1.9.2
# also imported by Tool.py and main.py; versions left unpinned
pandas
pyquery
11 Tweets_Tool/setup.py Normal file

@@ -0,0 +1,11 @@
from setuptools import setup

setup(name='Twitter_Tool',
      version='0.1',
      description='Web scraping',
      url='http://github.com/sdf94/Twitter_Tool',
      author='Sarah Floris',
      author_email='sdf11c@acu.edu',
      license='MIT',
      packages=['Twitter_Tool'],
      zip_safe=False)