Add files via upload (#120)

This commit is contained in:
Sarah Floris 2020-03-17 09:55:48 -07:00 committed by GitHub
parent ded3ba322d
commit f016839ad0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 358 additions and 0 deletions

21
Tweets_Tool/LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Sarah Floris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

71
Tweets_Tool/README.md Normal file
View File

@ -0,0 +1,71 @@
# Tweets_Tool
The official Twitter API limits how many tweets you can search for at a time and how far back you can search (7–10 days). If I wanted historical tweets matching certain criteria, I would have to either buy the Enterprise API or use GNIP, both of which cost money.
## Prerequisites
Since Python 2.x will be deprecated, this package assumes Python 3.x.
Expected package dependencies are listed in the "requirements.txt" file. To install them with pip, run the following command:
pip install -r requirements.txt
## Components
**Tweet**: Model class providing information about a specific tweet.
* id (str)
* permalink (str)
* username (str)
* text (str)
* date (date)
* retweets (int)
* favorites (int)
* mentions (str)
* hashtags (str)
* geo (str)
**TweetManager:** A manager class to help getting tweets in Tweet's model
* getTweets: Return the list of tweets retrieved by using an instance of TwitterCriteria
* getJsonReponse: Actually obtains the tweets and returns an object that can be read
**TwitterCriteria:** A collection of search parameters to be used together with TweetManager
* setUsername (str): An optional specific username from a twitter account. Without "@"
* setSince (str. "yyyy-mm-dd"): A lower bound date to restrict search
* setUntil (str. "yyyy-mm-dd"): An upper bound date to restrict search
* setQuerySearch (str): A query text to be matched
* setTopTweets (bool): If True only the Top Tweets will be retrieved
* setNear(str): A reference location area from where tweets were generated
* setWithin (str): A distance radius from "near" location (e.g. 15mi)
* setMaxTweets (int): The maximum number of tweets to be retrieved. If this number is unset or lower than 1, all possible tweets will be retrieved.
**TweetObtain:** Returns a clean dataframe for analysis using TweetCriteria and TweetManager
* TweetObtain_function: Returns a clean dataframe for analysis using TweetCriteria and TweetManager
## Simple examples of python usage
* Get tweets by username
``` python
tweetCriteria = Tool.TweetCriteria().setUsername('barackobama').setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
* Get tweets by query search
``` python
tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
* Get tweets by username and bound dates
``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```
* Get the last 10 top tweets by username
``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setTopTweets(True).setMaxTweets(10)
# first one
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

224
Tweets_Tool/Tool.py Normal file
View File

@ -0,0 +1,224 @@
try:
import http.cookiejar as cookielib
except ImportError:
import cookielib
import urllib, json, re, datetime, sys
import pandas as pd
import numpy as np
from pyquery import PyQuery
class TweetCriteria:
    """Fluent builder holding the search filters used by TweetManager.getTweets.

    Only ``maxTweets`` is pre-initialised. Every other filter attribute exists
    only after its setter has been called — downstream code probes them with
    ``hasattr`` — so setters must not pre-create attributes.
    """

    def __init__(self):
        # 0 means "no cap": collect every tweet the search yields.
        self.maxTweets = 0

    def setUsername(self, username):
        """Restrict the search to one account name (without '@')."""
        self.username = username
        return self

    def setSince(self, since):
        """Lower bound date for the search, "yyyy-mm-dd"."""
        self.since = since
        return self

    def setUntil(self, until):
        """Upper bound date for the search, "yyyy-mm-dd"."""
        self.until = until
        return self

    def setQuerySearch(self, querySearch):
        """Free-text query the tweets must match."""
        self.querySearch = querySearch
        return self

    def setMaxTweets(self, maxTweets):
        """Cap on the number of tweets to retrieve (values < 1 mean unlimited)."""
        self.maxTweets = maxTweets
        return self

    def setLang(self, Lang):
        """Language filter (stored on the instance as ``lang``)."""
        self.lang = Lang
        return self

    def setTopTweets(self, topTweets):
        """If True, only "Top Tweets" will be retrieved."""
        self.topTweets = topTweets
        return self
class Tweet:
    """Plain data holder for one scraped tweet.

    Attributes (id, permalink, username, text, date, retweets, favorites,
    mentions, hashtags, geo) are assigned externally by TweetManager.
    """

    def __init__(self):
        # Intentionally empty: fields are attached after construction.
        pass
class TweetManager:
    """Scrapes tweets from Twitter's legacy search-timeline HTML endpoint."""

    def __init__(self):
        pass

    @staticmethod
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        """Return the list of tweets retrieved by using an instance of TweetCriteria.

        Parameters:
            tweetCriteria: TweetCriteria instance holding the search filters.
            receiveBuffer: optional callable; called with each batch of
                `bufferLength` tweets as they arrive, and once more with any
                remainder at the end.
            bufferLength: batch size for `receiveBuffer` callbacks.
            proxy: optional "host:port" string used for all HTTP requests.
        """
        refreshCursor = ''
        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()
        # Strip one pair of surrounding quotes a caller may have left around
        # the username (e.g. "'user'" from shell quoting).
        if hasattr(
                tweetCriteria,
                'username') and (tweetCriteria.username.startswith("\'")
                                 or tweetCriteria.username.startswith("\"")) and (
                                     tweetCriteria.username.endswith("\'")
                                     or tweetCriteria.username.endswith("\"")):
            tweetCriteria.username = tweetCriteria.username[1:-1]
        active = True
        while active:
            # Renamed from `json` (original) so the stdlib json module is not
            # shadowed inside this function.
            payload = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                                  cookieJar, proxy)
            if len(payload['items_html'].strip()) == 0:
                break
            refreshCursor = payload['min_position']
            scrapedTweets = PyQuery(payload['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')
            if len(tweets) == 0:
                break
            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = Tweet()
                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                # Undo the "# tag" / "@ user" spacing present in the HTML text.
                txt = re.sub(r"\s+", " ",
                             tweetPQ("p.js-tweet-text").text().replace(
                                 '# ', '#').replace('@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                # Renamed from `id` (original) to avoid shadowing the builtin.
                tweetId = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")
                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')
                tweet.id = tweetId
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                # NOTE(review): fromtimestamp() uses the local timezone —
                # confirm whether UTC (utcfromtimestamp) was intended.
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo
                results.append(tweet)
                resultsAux.append(tweet)
                # Flush a full batch to the caller-supplied sink, if any.
                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []
                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break
        # Flush tweets left over from a partial batch.
        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)
        return results

    @staticmethod
    def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
        """Actually obtains the tweets and returns an object that can be read.

        Builds the search query string from `tweetCriteria`, performs the HTTP
        GET (optionally through `proxy`), and returns the decoded JSON dict.
        Exits the process if the request fails.
        """
        url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"
        urlGetData = ''
        if hasattr(tweetCriteria, 'username'):
            urlGetData += ' from:' + tweetCriteria.username
        if hasattr(tweetCriteria, 'querySearch'):
            urlGetData += ' ' + tweetCriteria.querySearch
        if hasattr(tweetCriteria, 'near'):
            urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within
        if hasattr(tweetCriteria, 'since'):
            urlGetData += ' since:' + tweetCriteria.since
        if hasattr(tweetCriteria, 'until'):
            urlGetData += ' until:' + tweetCriteria.until
        if hasattr(tweetCriteria, 'topTweets'):
            if tweetCriteria.topTweets:
                # Dropping the f=tweets parameter yields "Top" results.
                url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"
        url = url % (urllib.parse.quote(urlGetData), refreshCursor)
        headers = [('Host', "twitter.com"),
                   ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
                   ('Accept',
                    "application/json, text/javascript, */*; q=0.01"),
                   ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
                   ('X-Requested-With',
                    "XMLHttpRequest"), ('Referer', url), ('Connection',
                                                          "keep-alive")]
        if proxy:
            # BUGFIX: was `urllib.HTTPCookieProcessor` — that attribute does
            # not exist in Python 3; the class lives in urllib.request (as the
            # non-proxy branch below already had it).
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({
                    'http': proxy,
                    'https': proxy
                }), urllib.request.HTTPCookieProcessor(cookieJar))
        else:
            opener = urllib.request.build_opener(
                urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = headers
        try:
            response = opener.open(url)
            jsonResponse = response.read()
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # not swallowed; the unreachable `return` after sys.exit() was removed.
        except Exception:
            print(
                "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd"
                % urllib.parse.quote(urlGetData))
            sys.exit()
        dataJson = json.loads(jsonResponse)
        return dataJson
class TweetObtain:
    """Wraps TweetCriteria/TweetManager into a DataFrame-producing helper."""

    def __init__(self):
        pass

    def TweetObtain_function(self, videogame):
        """Returns a clean dataframe for analysis using TweetCriteria and TweetManager.

        `videogame` is a sequence of (query, since_date, until_date); the
        resulting frame has the tweet date/text plus the query metadata.
        """
        print(videogame)
        criteria = (TweetCriteria()
                    .setQuerySearch(videogame[0])
                    .setSince(videogame[1])
                    .setUntil(videogame[2])
                    .setMaxTweets(1000))
        fetched = TweetManager().getTweets(criteria)
        dates = [item.date for item in fetched]
        texts = [item.text for item in fetched]
        frame = pd.DataFrame(np.column_stack((dates, texts)))
        frame['name'] = videogame[0]
        frame['start_date'] = videogame[1]
        frame['end_date'] = videogame[2]
        return frame

2
Tweets_Tool/example.csv Normal file
View File

@ -0,0 +1,2 @@
0,1,2,3
0,Bayonetta 2,18 Oct 85 ,18 Oct 85
1 0 1 2 3
2 0 Bayonetta 2 18 Oct 85 18 Oct 85

24
Tweets_Tool/main.py Normal file
View File

@ -0,0 +1,24 @@
import sys
assert sys.version_info >= (3, 0)
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import Tool
def main(filepath):
    """Scrape tweets for every (game, start_date, end_date) row of a CSV.

    Reads `filepath`, fans the rows out over a thread pool (the work is
    network-bound, so threads overlap the HTTP waits), stacks the resulting
    frames, and writes the combined output to 'tweets.csv'.

    Raises:
        TypeError: if `filepath` is not a str.
    """
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if not isinstance(filepath, str):
        raise TypeError("filepath must be a str, got %r" % type(filepath).__name__)
    videogames = pd.read_csv(filepath, skiprows=1,
                             names=['games', 'start_date', 'end_date'])
    videogames = videogames.values
    pool = ThreadPool(250)
    try:
        text_results = pool.map(Tool.TweetObtain().TweetObtain_function,
                                videogames)
    finally:
        # Reap worker threads even when a worker raises (the original leaked
        # the pool on any exception from map()).
        pool.close()
        pool.join()
    text_results = pd.DataFrame(
        np.vstack(text_results))
    text_results.to_csv('tweets.csv')
    return


if __name__ == "__main__":
    main(sys.argv[1])

View File

@ -0,0 +1,5 @@
requests==2.18.4
urllib3==1.22
py4j==0.10.4
# NOTE: BeautifulSoup 3.x is Python 2-only, but this project requires Python 3
# (see README / main.py's version assert) — pinned to the Python 3 fork instead.
beautifulsoup4==4.6.0
numpy==1.9.2

11
Tweets_Tool/setup.py Normal file
View File

@ -0,0 +1,11 @@
from setuptools import setup
# Package metadata for distribution via setuptools.
setup(name='Twitter_Tool',
      version='0.1',
      description='Web scraping ',
      url='http://github.com/sdf94/Twitter_Tool',
      author='Sarah Floris',
      author_email='sdf11c@acu.edu',
      license='MIT',
      # NOTE(review): the files in this commit live under "Tweets_Tool/" but
      # the package is declared as "Twitter_Tool" — confirm the directory name
      # matches, otherwise the build will not find the package.
      packages=['Twitter_Tool'],
      zip_safe=False)