mirror of https://github.com/hastagAB/Awesome-Python-Scripts.git
synced 2024-11-24 04:21:08 +00:00

Add files via upload (#120)

This commit is contained in:
parent ded3ba322d
commit f016839ad0
21  Tweets_Tool/LICENSE  Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Sarah Floris

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
71  Tweets_Tool/README.md  Normal file

@@ -0,0 +1,71 @@
# Tweets_Tool

The official Twitter API limits both how many tweets you can search for at a time and how far back the search can go (7 days/10 days). If I wanted some historical tweets matching a certain set of criteria, I would have to either buy the Enterprise API or get GNIP, both of which cost money.

## Prerequisites

Since Python 2.x will be deprecated, this package assumes Python 3.x.
Expected package dependencies are listed in the "requirements.txt" file for pip; run the following command to install them:

    pip install -r requirements.txt
## Components

**Tweet**: Model class that holds information about a specific tweet.

* id (str)
* permalink (str)
* username (str)
* text (str)
* date (date)
* retweets (int)
* favorites (int)
* mentions (str)
* hashtags (str)
* geo (str)

**TweetManager:** A manager class that retrieves tweets in Tweet's model.

* getTweets: Returns the list of tweets retrieved by using an instance of TweetCriteria
* getJsonResponse: Actually obtains the tweets and returns an object that can be read

**TweetCriteria:** A collection of search parameters to be used together with TweetManager.

* setUsername (str): An optional specific username from a Twitter account, without "@"
* setSince (str, "yyyy-mm-dd"): A lower bound date to restrict the search
* setUntil (str, "yyyy-mm-dd"): An upper bound date to restrict the search
* setQuerySearch (str): A query text to be matched
* setTopTweets (bool): If True, only the Top Tweets will be retrieved
* setNear (str): A reference location area from where tweets were generated
* setWithin (str): A distance radius from the "near" location (e.g. 15mi)
* setMaxTweets (int): The maximum number of tweets to be retrieved. If this number is unset or lower than 1, all possible tweets will be retrieved.

**TweetObtain:** Returns a clean dataframe for analysis using TweetCriteria and TweetManager.

* TweetObtain_function: Returns a clean dataframe for analysis using TweetCriteria and TweetManager (see the sketch below)

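A minimal sketch (not part of the committed files) of calling TweetObtain directly; the three-element row mirrors what main.py reads from the CSV, and the query and dates here are made-up values:

``` python
import Tool

# Hypothetical [query, start date, end date] row, same shape as a row of example.csv
row = ['Bayonetta 2', '2015-05-01', '2015-05-07']
df = Tool.TweetObtain().TweetObtain_function(row)
print(df.head())
```
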
## Simple examples of Python usage

* Get tweets by username

``` python
tweetCriteria = Tool.TweetCriteria().setUsername('barackobama').setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get tweets by query search

``` python
tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get tweets by username and bound dates

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get the last 10 top tweets by username

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setTopTweets(True).setMaxTweets(10)
# first one
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
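
* Get tweets near a location (a sketch, not in the original README: setNear/setWithin are listed under TweetCriteria above, and the place, radius, and query here are made-up values)

``` python
tweetCriteria = Tool.TweetCriteria().setNear('Berlin, Germany').setWithin('15mi').setQuerySearch('refugees').setMaxTweets(5)
tweets = Tool.TweetManager.getTweets(tweetCriteria)
for tweet in tweets:
    print(tweet.date, tweet.text)
```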
224  Tweets_Tool/Tool.py  Normal file

@@ -0,0 +1,224 @@
try:
    import http.cookiejar as cookielib
except ImportError:
    import cookielib

# urllib submodules must be imported explicitly in Python 3;
# urllib.request and urllib.parse are both used below.
import urllib.request
import urllib.parse
import json, re, datetime, sys
import pandas as pd
import numpy as np
from pyquery import PyQuery


class TweetCriteria:
    def __init__(self):
        self.maxTweets = 0

    def setUsername(self, username):
        self.username = username
        return self

    def setSince(self, since):
        self.since = since
        return self

    def setUntil(self, until):
        self.until = until
        return self

    def setQuerySearch(self, querySearch):
        self.querySearch = querySearch
        return self

    def setMaxTweets(self, maxTweets):
        self.maxTweets = maxTweets
        return self

    def setLang(self, Lang):
        self.lang = Lang
        return self

    def setTopTweets(self, topTweets):
        self.topTweets = topTweets
        return self

    # The README documents setNear/setWithin and getJsonResponse reads
    # tweetCriteria.near/.within, but the setters were missing; added so the
    # documented API works.
    def setNear(self, near):
        self.near = near
        return self

    def setWithin(self, within):
        self.within = within
        return self


class Tweet:
    def __init__(self):
        pass


class TweetManager:
    def __init__(self):
        pass

    @staticmethod
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        """Return the list of tweets retrieved by using an instance of TweetCriteria"""
        refreshCursor = ''
        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        # Strip surrounding quotes from the username, if any
        if hasattr(tweetCriteria, 'username') and (
                tweetCriteria.username.startswith("\'")
                or tweetCriteria.username.startswith("\"")) and (
                    tweetCriteria.username.endswith("\'")
                    or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        while active:
            response = TweetManager.getJsonResponse(tweetCriteria, refreshCursor,
                                                    cookieJar, proxy)
            if len(response['items_html'].strip()) == 0:
                break

            refreshCursor = response['min_position']
            scrapedTweets = PyQuery(response['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = Tweet()

                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(r"\s+", " ",
                             tweetPQ("p.js-tweet-text").text().replace(
                                 '# ', '#').replace('@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')

                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo

                results.append(tweet)
                resultsAux.append(tweet)

                # Hand each completed batch to the caller-supplied callback
                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        # Flush any remaining buffered tweets
        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results

    @staticmethod
    def getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy):
        """Actually obtains the tweets and returns an object that can be read"""
        url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"

        urlGetData = ''

        if hasattr(tweetCriteria, 'username'):
            urlGetData += ' from:' + tweetCriteria.username

        if hasattr(tweetCriteria, 'querySearch'):
            urlGetData += ' ' + tweetCriteria.querySearch

        if hasattr(tweetCriteria, 'near'):
            urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within

        if hasattr(tweetCriteria, 'since'):
            urlGetData += ' since:' + tweetCriteria.since

        if hasattr(tweetCriteria, 'until'):
            urlGetData += ' until:' + tweetCriteria.until

        if hasattr(tweetCriteria, 'topTweets'):
            if tweetCriteria.topTweets:
                url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"
        url = url % (urllib.parse.quote(urlGetData), refreshCursor)

        headers = [('Host', "twitter.com"),
                   ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
                   ('Accept', "application/json, text/javascript, */*; q=0.01"),
                   ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
                   ('X-Requested-With', "XMLHttpRequest"),
                   ('Referer', url),
                   ('Connection', "keep-alive")]
        if proxy:
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({
                    'http': proxy,
                    'https': proxy
                }), urllib.request.HTTPCookieProcessor(cookieJar))
        else:
            opener = urllib.request.build_opener(
                urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = headers

        try:
            response = opener.open(url)
            jsonResponse = response.read()
        except Exception:
            print(
                "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd"
                % urllib.parse.quote(urlGetData))
            sys.exit()

        dataJson = json.loads(jsonResponse)

        return dataJson


class TweetObtain:
    def __init__(self):
        pass

    def TweetObtain_function(self, videogame):
        """Returns a clean dataframe for analysis using TweetCriteria and TweetManager"""
        print(videogame)
        tweet_date = []
        tweet_text = []
        tweetCriteria = TweetCriteria().setQuerySearch(videogame[0]). \
            setSince(videogame[1]).setUntil(videogame[2]).setMaxTweets(1000)
        tweets = TweetManager().getTweets(tweetCriteria)
        for tweet in tweets:
            tweet_date.append(tweet.date)
            tweet_text.append(tweet.text)
        df = pd.DataFrame(np.column_stack((tweet_date, tweet_text)))
        df['name'] = videogame[0]
        df['start_date'] = videogame[1]
        df['end_date'] = videogame[2]
        return df
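A minimal sketch (assumed to run alongside Tool.py; the callback name is made up) of the receiveBuffer/bufferLength mechanism above: getTweets hands the callback each batch of bufferLength tweets and flushes the remainder at the end, so results can be processed incrementally instead of waiting for the full list.

``` python
import Tool

def on_batch(batch):
    # Called with a list of Tool.Tweet every 50 tweets (and once at the end)
    for t in batch:
        print(t.date, t.text[:60])

criteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setMaxTweets(200)
Tool.TweetManager.getTweets(criteria, receiveBuffer=on_batch, bufferLength=50)
```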
2  Tweets_Tool/example.csv  Normal file

@@ -0,0 +1,2 @@
0,1,2,3
0,Bayonetta 2,18 Oct 85 ,18 Oct 85
24  Tweets_Tool/main.py  Normal file

@@ -0,0 +1,24 @@
import sys
assert sys.version_info >= (3, 0)
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import Tool


def main(filepath):
    assert isinstance(filepath, str)
    # Each CSV row: query name, start date, end date
    videogames = pd.read_csv(filepath, skiprows=1, names=['games', 'start_date', 'end_date'])
    videogames = videogames.values
    pool = ThreadPool(250)
    # Scrape every row's tweets concurrently, one TweetObtain call per row
    text_results = pool.map(Tool.TweetObtain().TweetObtain_function,
                            videogames)
    pool.close()
    pool.join()
    text_results = pd.DataFrame(np.vstack(text_results))
    text_results.to_csv('tweets.csv')
    return


if __name__ == "__main__":
    main(sys.argv[1])
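Judging from main() and the bundled example.csv, the tool would be invoked as below (an assumption; the commit does not state the command). It writes the combined results to tweets.csv, which the README examples then read:

    python main.py example.csv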
5  Tweets_Tool/requirements.txt  Normal file

@@ -0,0 +1,5 @@
requests==2.18.4
urllib3==1.22
py4j==0.10.4
BeautifulSoup==3.2.0
numpy==1.9.2
11  Tweets_Tool/setup.py  Normal file

@@ -0,0 +1,11 @@
from setuptools import setup

setup(name='Twitter_Tool',
      version='0.1',
      description='Web scraping',
      url='http://github.com/sdf94/Twitter_Tool',
      author='Sarah Floris',
      author_email='sdf11c@acu.edu',
      license='MIT',
      packages=['Twitter_Tool'],
      zip_safe=False)
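With setup.py in place, the package would presumably be installed in the usual setuptools way (an assumption; note that packages=['Twitter_Tool'] expects a Twitter_Tool/ directory, while the committed files live under Tweets_Tool/):

    pip install .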