Mirror of https://github.com/hastagAB/Awesome-Python-Scripts.git (synced 2024-11-23 20:11:07 +00:00)
Add files via upload (#120)

parent ded3ba322d, commit f016839ad0

Tweets_Tool/LICENSE (new file)

MIT License

Copyright (c) 2018 Sarah Floris

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Tweets_Tool/README.md (new file)

# Tweets_Tool

Twitter's official API limits how many tweets you can search for at a time and how far back the search reaches (7 days/10 days). If I wanted some historical tweets matching certain criteria, I would have to either buy the Enterprise API or get GNIP, both of which cost money.

## Prerequisites

Since Python 2.x is being deprecated, this package assumes Python 3.x.

The package dependencies are listed in the "requirements.txt" file; run the following command to install them with pip:

    pip install -r requirements.txt

## Components

**Tweet**: Model class that holds the information about a specific tweet.

* id (str)
* permalink (str)
* username (str)
* text (str)
* date (date)
* retweets (int)
* favorites (int)
* mentions (str)
* hashtags (str)
* geo (str)

**TweetManager:** A manager class to help get tweets in Tweet's model; a buffered-retrieval sketch follows this list.

* getTweets: Returns the list of tweets retrieved by using an instance of TweetCriteria
* getJsonReponse: Actually obtains the tweets and returns an object that can be read
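
`getTweets` also accepts an optional `receiveBuffer` callback plus a `bufferLength` (100 by default, per Tool.py), so long pulls can be processed in batches. A minimal sketch, assuming the package is importable as `Tool`; the callback name `collect` is illustrative:

``` python
import Tool

collected = []

def collect(batch):
    # invoked with up to bufferLength Tweet objects at a time
    collected.extend(batch)

tweetCriteria = Tool.TweetCriteria().setQuerySearch('python').setMaxTweets(250)
Tool.TweetManager.getTweets(tweetCriteria, receiveBuffer=collect, bufferLength=100)
print(len(collected))
```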

**TweetCriteria:** A collection of search parameters to be used together with TweetManager; the setters chain, as shown in the sketch after this list.

* setUsername (str): An optional specific username from a twitter account, without "@"
* setSince (str, "yyyy-mm-dd"): A lower bound date to restrict the search
* setUntil (str, "yyyy-mm-dd"): An upper bound date to restrict the search
* setQuerySearch (str): A query text to be matched
* setTopTweets (bool): If True, only the Top Tweets will be retrieved
* setNear (str): A reference location area from where tweets were generated
* setWithin (str): A distance radius from the "near" location (e.g. 15mi)
* setMaxTweets (int): The maximum number of tweets to be retrieved. If this number is unset or lower than 1, all possible tweets will be retrieved.
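
Every setter returns the criteria object itself, so a search is composed by chaining. A sketch of a location-bounded query (place, radius, and dates are illustrative values):

``` python
import Tool

tweetCriteria = Tool.TweetCriteria() \
    .setQuerySearch('europe refugees') \
    .setNear('Berlin, Germany') \
    .setWithin('15mi') \
    .setSince('2015-05-01') \
    .setUntil('2015-09-30') \
    .setMaxTweets(100)
```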

**TweetObtain:** Returns a clean dataframe for analysis using TweetCriteria and TweetManager; see the sketch after this list.

* TweetObtain_function: Builds the criteria from a (name, start date, end date) row, retrieves up to 1000 matching tweets, and returns a clean dataframe for analysis
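
A hedged usage sketch: `TweetObtain_function` takes one (name, start date, end date) row, like a row of example.csv; the title and date range below are illustrative:

``` python
import Tool

# dates must use the "yyyy-mm-dd" form expected by setSince/setUntil
df = Tool.TweetObtain().TweetObtain_function(
    ['Bayonetta 2', '2014-10-01', '2014-10-31'])
print(df.head())
```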

## Simple examples of Python usage

* Get tweets by username

``` python
import pandas as pd
import Tool

tweetCriteria = Tool.TweetCriteria().setUsername('barackobama').setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
# tweets.csv is produced by running main.py
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get tweets by query search

``` python
tweetCriteria = Tool.TweetCriteria().setQuerySearch('europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get tweets by username and bound dates

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1)
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

* Get the last 10 top tweets by username

``` python
tweetCriteria = Tool.TweetCriteria().setUsername("barackobama").setTopTweets(True).setMaxTweets(10)
# first one
tweet = Tool.TweetManager.getTweets(tweetCriteria)[0]
tweets = pd.read_csv('tweets.csv')
print(tweets)
```

Tweets_Tool/Tool.py (new file)

try:
    import http.cookiejar as cookielib
except ImportError:
    import cookielib  # Python 2 fallback

# urllib.request / urllib.parse are used below, so import them explicitly
import urllib.request, urllib.parse
import json, re, datetime, sys
import pandas as pd
import numpy as np
from pyquery import PyQuery

class TweetCriteria:
    """Collects search parameters; every setter returns self so calls chain."""

    def __init__(self):
        self.maxTweets = 0

    def setUsername(self, username):
        self.username = username
        return self

    def setSince(self, since):
        self.since = since
        return self

    def setUntil(self, until):
        self.until = until
        return self

    def setQuerySearch(self, querySearch):
        self.querySearch = querySearch
        return self

    # setNear/setWithin are documented in the README and read via hasattr()
    # in getJsonReponse; added here so they exist on the criteria object
    def setNear(self, near):
        self.near = near
        return self

    def setWithin(self, within):
        self.within = within
        return self

    def setMaxTweets(self, maxTweets):
        self.maxTweets = maxTweets
        return self

    def setLang(self, Lang):
        self.lang = Lang
        return self

    def setTopTweets(self, topTweets):
        self.topTweets = topTweets
        return self

class Tweet:
    """Plain holder for scraped tweet attributes, filled in by TweetManager."""

    def __init__(self):
        pass

class TweetManager:
    def __init__(self):
        pass

    @staticmethod
    def getTweets(tweetCriteria,
                  receiveBuffer=None,
                  bufferLength=100,
                  proxy=None):
        """Return the list of tweets retrieved by using an instance of TweetCriteria"""
        refreshCursor = ''
        results = []
        resultsAux = []
        cookieJar = cookielib.CookieJar()

        # Strip symmetric quotes around the username, if present
        if hasattr(tweetCriteria, 'username') and \
           (tweetCriteria.username.startswith("'") or tweetCriteria.username.startswith('"')) and \
           (tweetCriteria.username.endswith("'") or tweetCriteria.username.endswith('"')):
            tweetCriteria.username = tweetCriteria.username[1:-1]

        active = True

        while active:
            # Page through the search timeline until no more results arrive
            # (renamed from `json` to avoid shadowing the json module)
            dataJson = TweetManager.getJsonReponse(tweetCriteria, refreshCursor,
                                                   cookieJar, proxy)
            if len(dataJson['items_html'].strip()) == 0:
                break

            refreshCursor = dataJson['min_position']
            scrapedTweets = PyQuery(dataJson['items_html'])
            # Remove incomplete tweets withheld by Twitter Guidelines
            scrapedTweets.remove('div.withheld-tweet')
            tweets = scrapedTweets('div.js-stream-tweet')

            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet = Tweet()

                usernameTweet = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(r"\s+", " ",
                             tweetPQ("p.js-tweet-text").text().replace(
                                 '# ', '#').replace('@ ', '@'))
                retweets = int(
                    tweetPQ(
                        "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                favorites = int(
                    tweetPQ(
                        "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount"
                    ).attr("data-tweet-stat-count").replace(",", ""))
                dateSec = int(
                    tweetPQ("small.time span.js-short-timestamp").attr(
                        "data-time"))
                id = tweetPQ.attr("data-tweet-id")
                permalink = tweetPQ.attr("data-permalink-path")

                geo = ''
                geoSpan = tweetPQ('span.Tweet-geo')
                if len(geoSpan) > 0:
                    geo = geoSpan.attr('title')

                tweet.id = id
                tweet.permalink = 'https://twitter.com' + permalink
                tweet.username = usernameTweet
                tweet.text = txt
                tweet.date = datetime.datetime.fromtimestamp(dateSec)
                tweet.retweets = retweets
                tweet.favorites = favorites
                tweet.mentions = " ".join(
                    re.compile('(@\\w*)').findall(tweet.text))
                tweet.hashtags = " ".join(
                    re.compile('(#\\w*)').findall(tweet.text))
                tweet.geo = geo

                results.append(tweet)
                resultsAux.append(tweet)

                # Flush full buffers to the optional callback
                if receiveBuffer and len(resultsAux) >= bufferLength:
                    receiveBuffer(resultsAux)
                    resultsAux = []

                if tweetCriteria.maxTweets > 0 and len(
                        results) >= tweetCriteria.maxTweets:
                    active = False
                    break

        # Flush any remaining buffered tweets
        if receiveBuffer and len(resultsAux) > 0:
            receiveBuffer(resultsAux)

        return results

    @staticmethod
    def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
        """Actually obtains the tweets and returns an object that can be read"""
        url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"

        # Build the search operators piece by piece
        urlGetData = ''

        if hasattr(tweetCriteria, 'username'):
            urlGetData += ' from:' + tweetCriteria.username

        if hasattr(tweetCriteria, 'querySearch'):
            urlGetData += ' ' + tweetCriteria.querySearch

        if hasattr(tweetCriteria, 'near'):
            # near:/within: are search operators inside q, not URL parameters
            urlGetData += ' near:' + tweetCriteria.near + ' within:' + tweetCriteria.within

        if hasattr(tweetCriteria, 'since'):
            urlGetData += ' since:' + tweetCriteria.since

        if hasattr(tweetCriteria, 'until'):
            urlGetData += ' until:' + tweetCriteria.until

        if hasattr(tweetCriteria, 'topTweets'):
            if tweetCriteria.topTweets:
                # dropping f=tweets makes Twitter return Top Tweets
                url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"
        url = url % (urllib.parse.quote(urlGetData), refreshCursor)

        headers = [('Host', "twitter.com"),
                   ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
                   ('Accept', "application/json, text/javascript, */*; q=0.01"),
                   ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
                   ('X-Requested-With', "XMLHttpRequest"),
                   ('Referer', url),
                   ('Connection', "keep-alive")]
        if proxy:
            opener = urllib.request.build_opener(
                urllib.request.ProxyHandler({
                    'http': proxy,
                    'https': proxy
                }), urllib.request.HTTPCookieProcessor(cookieJar))
        else:
            opener = urllib.request.build_opener(
                urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders = headers

        try:
            response = opener.open(url)
            jsonResponse = response.read()
        except Exception:
            print(
                "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd"
                % urllib.parse.quote(urlGetData))
            sys.exit()
        dataJson = json.loads(jsonResponse)

        return dataJson


class TweetObtain:
    def __init__(self):
        pass

    def TweetObtain_function(self, videogame):
        """Returns a clean dataframe for analysis using TweetCriteria and TweetManager"""
        print(videogame)
        tweet_date = []
        tweet_text = []
        # videogame is a (name, start date, end date) row, as in example.csv
        tweetCriteria = TweetCriteria().setQuerySearch(videogame[0]). \
            setSince(videogame[1]).setUntil(videogame[2]).setMaxTweets(1000)
        tweets = TweetManager().getTweets(tweetCriteria)
        for tweet in tweets:
            tweet_date.append(tweet.date)
            tweet_text.append(tweet.text)
        df = pd.DataFrame(np.column_stack((tweet_date, tweet_text)))
        df['name'] = videogame[0]
        df['start_date'] = videogame[1]
        df['end_date'] = videogame[2]
        return df

Tweets_Tool/example.csv (new file)

0,1,2,3
0,Bayonetta 2,18 Oct 85 ,18 Oct 85

Tweets_Tool/main.py (new file)

import sys
assert sys.version_info >= (3, 0)
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import Tool


def main(filepath):
    assert isinstance(filepath, str)
    # each row: name, start date, end date (see example.csv)
    videogames = pd.read_csv(filepath, skiprows=1,
                             names=['games', 'start_date', 'end_date'])
    videogames = videogames.values
    # one query per row, up to 250 in flight at once
    pool = ThreadPool(250)
    text_results = pool.map(Tool.TweetObtain().TweetObtain_function,
                            videogames)
    pool.close()
    pool.join()
    text_results = pd.DataFrame(np.vstack(text_results))
    text_results.to_csv('tweets.csv')
    return


if __name__ == "__main__":
    main(sys.argv[1])
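
A usage sketch for the script above, assuming the dependencies are installed: calling main() from Python is equivalent to running `python main.py example.csv` from the shell.

``` python
# writes tweets.csv in the working directory; the input CSV rows are
# name, start date, end date (dates in the "yyyy-mm-dd" form)
import main

main.main('example.csv')
```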

Tweets_Tool/requirements.txt (new file)

requests==2.18.4
urllib3==1.22
py4j==0.10.4
BeautifulSoup==3.2.0
numpy==1.9.2
# pandas and pyquery are imported by Tool.py/main.py but were missing here
pandas
pyquery

Tweets_Tool/setup.py (new file)

from setuptools import setup

setup(name='Twitter_Tool',
      version='0.1',
      description='Web scraping',
      url='http://github.com/sdf94/Twitter_Tool',
      author='Sarah Floris',
      author_email='sdf11c@acu.edu',
      license='MIT',
      packages=['Twitter_Tool'],
      zip_safe=False)