Twitter is a social network for spreading short messages. Within a few seconds, one can let people all over the world take part in one's current experiences. The only limitation: a message, a so-called tweet, may contain at most 140 characters.
The microblogging platform Twitter is popular worldwide and has more than 320 million active users (as of January 2016), who spread roughly 500 million tweets per day. This is an enormous amount of data that can be recorded over an arbitrarily long period. Besides the actual text, a tweet also carries metadata such as the user name, the user's position (if enabled), and much more. Twitter is therefore well suited for studying a wide range of problems in the context of big data.
The goal of this seminar paper was to perform a geographic and temporal trend analysis based on Twitter hashtags and to visualize the result on an interactive world map.
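The following example shows a single tweet in the raw JSON form delivered by the Twitter APIs. Besides the message text it contains the user profile, the place information used for geolocation, and the hashtag entities: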
{"created_at":"Fri Oct 24 00:00:59 +0000 2014","id":525436758037913600,"id_str":"525436758037913600","text":"Haciendo lo imposible para no dormirme, el amor es m\u00e1s fuerte... \u00a1Dale #Boca carajo!","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":71515259,"id_str":"71515259","name":"Fede Gallardo","screen_name":"gallardof","location":"Tigre, Arg. Madrid, Esp.","url":"http:\/\/gallardof.wordpress.com","description":"Falso escritor. Hincha de #Boca. Amante del Norte y del Sur, la pesca y el mate. Admirador del flaco #Sabina y del gran Eddie #Vedder. Corredor por elecci\u00f3n.","protected":false,"verified":false,"followers_count":345,"friends_count":194,"listed_count":1,"favourites_count":191,"statuses_count":598,"created_at":"Fri Sep 04 11:44:44 +0000 2009","utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/524499377336094721\/6ZyVrpWt_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/524499377336094721\/6ZyVrpWt_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/71515259\/1405390550","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":{"id":"ea520f3528c9b3e4","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/ea520f3528c9b3e4.json","place_type":"admin","name":"Madrid","full_name":"Madrid, Comunidad de Madrid","country_code":"ES","country":"Espa\u00f1a","bounding_box":{"type":"Polygon","coordinates":[[[-4.5791745,39.8845366],[-4.5791745,41.1649106],[-3.0531322,41.1649106],[-3.0531322,39.8845366]]]},"attributes":{}},"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"Boca","indices":[71,76]}],"trends":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es","timestamp_ms":"1414108859923"}
import json
import gzip
from lib import Tweet
import sqlite3 as lite
import sys
from os import listdir
from os.path import isfile, join, isdir
import datetime
def loadDocument(path):
    """Extract all geo-tagged tweets that carry at least one hashtag from one gzipped archive file."""
    tweets = []
    with gzip.open(path, 'rb') as f:
        for line in f:
            item = json.loads(line)
            if item['entities']['hashtags'] != []:
                if item['geo'] != None and item['geo']['coordinates'] != None:
                    # the 'geo' field stores [latitude, longitude]
                    t_id = item['id']
                    t_date = item['created_at']
                    t_latitude = item['geo']['coordinates'][0]
                    t_longitude = item['geo']['coordinates'][1]
                    t_hashtags = []
                    for hashtag in item['entities']['hashtags']:
                        t_hashtags.append(hashtag['text'])
                    t_text = item['text']
                    tweets.append(Tweet.Data(t_id, t_date, t_latitude, t_longitude, t_hashtags, t_text))
                elif item['place'] != None and item['place']['bounding_box'] != None and item['place']['bounding_box']['coordinates'] != None and item['place']['bounding_box']['coordinates'] != []:
                    # no exact position: use the center of the place bounding box;
                    # GeoJSON coordinates are [longitude, latitude]
                    t_id = item['id']
                    t_date = item['created_at']
                    t_latitude = ((item['place']['bounding_box']['coordinates'][0][0][1] + item['place']['bounding_box']['coordinates'][0][2][1]) / 2)
                    t_longitude = ((item['place']['bounding_box']['coordinates'][0][0][0] + item['place']['bounding_box']['coordinates'][0][2][0]) / 2)
                    t_hashtags = []
                    for hashtag in item['entities']['hashtags']:
                        t_hashtags.append(hashtag['text'])
                    t_text = item['text']
                    tweets.append(Tweet.Data(t_id, t_date, t_latitude, t_longitude, t_hashtags, t_text))
    return tweets
def addToDatabase(tweets):
    db = None
    try:
        db = lite.connect('database/Twitter.sqlite')
        cur = db.cursor()
        for tweet in tweets:
            cur.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (tweet.id, tweet.date, tweet.position.latitude, tweet.position.longitude, tweet.text))
            for hashtag in tweet.hashtags:
                cur.execute("INSERT INTO hashtags (tweet,hashtag) VALUES (?,?)", (tweet.id, hashtag))
    except lite.Error as e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
    finally:
        if db:
            db.commit()
            db.close()
def main():
    filepath = ''
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
        print "Reading from " + filepath
    startfolder = ''
    if len(sys.argv) > 2:
        startfolder = sys.argv[2]
    lastTime = ''
    if len(sys.argv) > 3:
        lastTime = sys.argv[3]
    print "Starting at " + startfolder
    folders = [folder for folder in listdir(filepath) if isdir(join(filepath, folder))]
    for folder in folders:
        if folder >= startfolder:
            files = [f for f in listdir(join(filepath, folder)) if isfile(join(filepath, join(folder, f)))]
            for f in files:
                splitname = f.split(".")
                if len(splitname) > 2:
                    # import only .gz files that lie after the given entry point
                    if (folder > startfolder or splitname[0] > lastTime) and splitname[2] == "gz":
                        addToDatabase(loadDocument(filepath + '/' + folder + '/' + f))
                        print "Document", folder, "/", f, "imported"

main()
$ python loadFile.py <path> (optional: start day) (optional: start time of day)
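All scripts rely on a helper module lib/Tweet.py and on two SQLite tables, neither of which is included in the listings. The following is a minimal sketch reconstructed from how they are used; the attribute names follow the accesses in the code, while the column types are assumptions:

# lib/Tweet.py -- hypothetical reconstruction, inferred from calls such as
# Tweet.Data(id, date, lat, lon, hashtags, text) and tweet.position.latitude
class Position:
    def __init__(self, latitude, longitude):
        self.latitude = latitude
        self.longitude = longitude

class Data:
    def __init__(self, id, date, latitude, longitude, hashtags, text):
        self.id = id
        self.date = date
        self.position = Position(latitude, longitude)
        self.hashtags = hashtags
        self.text = text

# assumed SQLite schema, inferred from the INSERT statements in loadFile.py
import sqlite3 as lite
db = lite.connect('database/Twitter.sqlite')
db.execute("CREATE TABLE IF NOT EXISTS tweets (id INTEGER PRIMARY KEY, date TEXT, latitude REAL, longitude REAL, text TEXT)")
db.execute("CREATE TABLE IF NOT EXISTS hashtags (tweet INTEGER, hashtag TEXT)")
db.commit()
db.close()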
variance=(sqSum/dayCount)-((sum/dayCount)*(sum/dayCount))
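The line above computes the score that ranks the hashtags: the variance of the number of occurrences per day, using the standard shortcut

\mathrm{Var}(X) = \frac{1}{n}\sum_{i=1}^{n} x_i^2 - \Big(\frac{1}{n}\sum_{i=1}^{n} x_i\Big)^2 = E[X^2] - E[X]^2

where x_i is the number of tweets carrying the hashtag on day i and n (dayCount) is the number of distinct days in the data set; sum and sqSum accumulate \sum_i x_i and \sum_i x_i^2, and days without any occurrence contribute zero to both sums. A hashtag that bursts on a few days thus scores higher than one used evenly every day, which is exactly the trend behaviour we want to surface. For each of the ten best-scoring hashtags the program writes a JSON file such as the following excerpt: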
{"setup":{"startYear":2014.0, "startMonth":9.0, "startDay":16.0, "startHour":0.0, "startMinute":0.0, "range":40.0, "step": "day", "avgdistance":367.97013152963467}, "startPoint":[{"latitude":54.170359,"longitude":-8.597168}],"D-24-09-2014":[{"latitude":-23.487694,"longitude":-47.449265}]{"latitude":-22.860791,"longitude":-43.254918}]{"latitude":-23.487652,"longitude":-47.448492}]{"latitude":-23.955464,"longitude":-46.5075055}]
import sqlite3 as lite
import sys
from lib import Tweet
import datetime
from math import sin, cos, sqrt, atan2, radians
import operator
def performQuery():
    print("Perform query")
    db = None
    try:
        variance = 0
        print("Start connection.")
        db = lite.connect('C:/Users/Michael/Desktop/Seminar Data Science/database/Twitter.sqlite')
        print("Connected")
        cur = db.cursor()
        dayCount = 0
        print("Day count is ", dayCount)
        print("Get number of days")
        cur.execute("SELECT date FROM tweets")
        dates = cur.fetchall()
        days = []
        for date in dates:
            days.append(convertDate(date[0]))
        dayCount = len(set(days))
        print("Number of different days: ", dayCount)
        print("Start fetching hashtags:")
        cur.execute("SELECT hashtag, count FROM (SELECT hashtag, COUNT(*) AS `count` FROM hashtags GROUP BY hashtag ORDER BY count DESC) WHERE count>400")
        hashtags = cur.fetchall()
        print("Fetched")
        hashtagScore = {}
        print("Getting scores of 500 most used hashtags:")
        for i in range(min(500, len(hashtags))):  # guard: there may be fewer than 500 frequent hashtags
            tag = (hashtags[i][0])
            print("Hashtag ", i, ": ")
            hashtagScore[tag] = 0
            cur.execute("SELECT date FROM ((((SELECT hashtag, tweet FROM hashtags WHERE hashtag=?) AS tags) JOIN tweets ON tags.tweet=tweets.id))", [tag])
            occurences = cur.fetchall()
            print("Hashtag ", i, " appears ", len(occurences), " times")
            print("Create dictionary for hashtag ", i)
            dateDict = {}
            for tweet in occurences:
                # raw created_at format: 'Fri Oct 24 00:00:59 +0000 2014' -> day key 'Fri_Oct 24'
                arr = tweet[0].split(" ")
                shortD = arr[0] + "_" + arr[1] + " " + arr[2]
                if shortD in dateDict:
                    dateDict[shortD] += 1
                else:
                    dateDict[shortD] = 1
            print("Done creating dictionary for hashtag ", i, ". Its length is ", len(dateDict))
            sum = 0
            sqSum = 0
            print("Getting variance over each day for hashtag ", i)
            for key, val in dateDict.items():
                sum += val
                sqSum += (val * val)
            print("Get count of hashtag")
            cur.execute("SELECT COUNT(*) FROM hashtags WHERE hashtag=?", [tag])
            count = cur.fetchone()
            print("Total count is ", count[0])
            # Var(X) = E[X^2] - E[X]^2; days without occurrences count as zero
            variance = (sqSum / dayCount) - ((sum / dayCount) * (sum / dayCount))
            print("Variance of hashtag ", i, " is: ", variance)
            if variance < 0:  # only possible through floating-point error
                variance = (-variance)
            hashtagScore[(hashtags[i][0])] = variance
            print("Score of hashtag ", i, " is: ", hashtagScore[hashtags[i][0]])
        print("Ranked all Hashtags.")
        print("Sort hashtags by variance:")
        sortedByScore = sorted(hashtagScore.items(), key=operator.itemgetter(1), reverse=True)
        print("Hashtags have been sorted.")
        print("Save tweets for 10 best scoring hashtags:")
        for i in range(10):  # choose only the 10 best-rated hashtags
            print("Getting all tweets of hashtag ", i)
            # fix: pick the hashtag from the score-sorted list and pass its own score
            tagI, tagScore = sortedByScore[i]
            cur.execute("SELECT date, latitude, longitude FROM (hashtags JOIN tweets ON hashtags.tweet=tweets.id) WHERE hashtag=?", (tagI,))
            tweets = cur.fetchall()
            print("Saving all tweets of hashtag ", i, " to JSONFile")
            writeJsonFile(tagI, tweets, i, tagScore)
            print("Done with hashtag ", i)
        print("Finished program")
    except lite.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    finally:
        if db:
            db.commit()
            db.close()
def getRange(start, end):
    # date keys have the format D-Minute-Hour-Day-Month-Year
    s_min = int(start.split("-")[1])
    s_hou = int(start.split("-")[2])
    s_day = int(start.split("-")[3])
    s_mon = int(start.split("-")[4])
    s_yea = int(start.split("-")[5])
    s = datetime.datetime(s_yea, s_mon, s_day, s_hou, s_min)
    e_min = int(end.split("-")[1])
    e_hou = int(end.split("-")[2])
    e_day = int(end.split("-")[3])
    e_mon = int(end.split("-")[4])
    e_yea = int(end.split("-")[5])
    e = datetime.datetime(e_yea, e_mon, e_day, e_hou, e_min)
    # time span between first and last tweet in days
    output = str((((e - s).total_seconds() / 60) / 60 / 24))
    return output
def writeJsonFile(name, tweets, index, variance):
    print("Set up output")
    output = "{\"setup\":{"
    print("Get date")
    start = tweets[0]
    end = tweets[len(tweets) - 1]
    duration = getRange((convertDate(start[0])), (convertDate(end[0])))
    startDate = convertDate(start[0]).split("-")
    startYear = float(startDate[5])
    startMonth = float(startDate[4])
    startDay = float(startDate[3])
    print("Get distances")
    avgDistance = getavaragedistance(tweets)
    strvariance = str(variance)
    if strvariance == "0":
        strvariance = "null"  # fix: 'None' is not valid JSON
    output += "\"startYear\":" + str(startYear) + ", \"startMonth\":" + str(startMonth) + ", \"startDay\":" + str(startDay) + ", \"startHour\":0.0" + ", \"startMinute\":0.0" + ", \"range\":" + str(duration) + ", \"step\": \"day\", \"avgdistance\":" + str(avgDistance) + ", \"dayvariance\":" + strvariance + "}"
    output += ", \"startPoint\":["
    differentDays = []
    print("Writing start point")
    # a result row is (date, latitude, longitude), so [1] is latitude and [2] longitude
    output += "{\"latitude\":" + str(start[1]) + ",\"longitude\":" + str(start[2]) + ",\"text\":\"...\"}]"
    print("Getting different days")
    for tweet in tweets:
        differentDays.append(convertDate(tweet[0]))
    print("Appending all days")
    for day in set(differentDays):
        output += ",\"" + day + "\":["
        print(day)
        for i in range(len(tweets)):
            if convertDate(tweets[i][0]) == day:
                output += "{\"latitude\":" + str(tweets[i][1]) + ",\"longitude\":" + str(tweets[i][2]) + ",\"text\":\"...\"},"
        output = output[:-1]  # strip the trailing comma
        output += "]"
    output += "}"
    print("Write to file")
    open("C:/Users/Michael/Desktop/Seminar Data Science/output/" + str(index) + "-" + name + ".json", "w").write(output)
def getavaragedistance(tweets):
    n = len(tweets)
    R = 6373.0  # approximate earth radius in km
    avgdistance = 0
    samplesize = 15
    partitioncount = (int(n / 1000)) - 1
    [...]
$ python createJson.py
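Within getavaragedistance, the distance between two sampled tweet positions is computed with the haversine formula for the great-circle distance on a sphere of radius R (the code uses R = 6373 km, the approximate earth radius):

a = \sin^2\!\Big(\frac{\Delta\varphi}{2}\Big) + \cos\varphi_1\,\cos\varphi_2\,\sin^2\!\Big(\frac{\Delta\lambda}{2}\Big), \qquad c = 2\,\mathrm{atan2}\big(\sqrt{a},\,\sqrt{1-a}\big), \qquad d = R\,c

with latitudes \varphi and longitudes \lambda in radians. The excerpt below implements exactly these steps: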
# great-circle distance between two sampled tweets (haversine formula);
# a row is (date, latitude, longitude)
lat1 = radians(tweets[x][1])
lon1 = radians(tweets[x][2])
lat2 = radians(tweets[y][1])
lon2 = radians(tweets[y][2])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
distance = R * c
The data is visualized on an interactive world map, on which the user can switch between two different views. The visualization consumes JSON files with the following structure:
var myVarName = {"setup" : {
"startYear": 2016,
"startMonth": 0, // 0-11
"startDay": 1, // 1-31
"startHour": 0, // 0-23
"startMinute": 0, // 0-59
"step": "minute" // values can be: "minute" , "hour" , "day" , "month"
}
,
"startPoint" : [
{
"latitude": 52.520007,
"longitude": 13.404954,
"text": "TWEET TEXT HERE!!"
}
],
// Format: D-MM-HH-DD-MM-YYYY
// D-Minute-Hour-Day-Month-Year
"D-01-10-01-01-2016" : [
{
"latitude": 29.541489,
"longitude": 58.615967,
"text": "TWEET TEXT HERE!!"
},
{
"latitude": -68.26971,
"longitude": -127.25137,
"text": "TWEET TEXT HERE!!"
},
...
{
"latitude": 58.289536,
"longitude": -120.70857,
"text": "TWEET TEXT HERE!!"
}
],
"D-02-10-01-01-2016" : [
{
"latitude": -10.519913,
"longitude": 22.112045,
"text": "TWEET TEXT HERE!!"
},
{
"latitude": -69.38677,
"longitude": 35.46237,
"text": "TWEET TEXT HERE!!"
},
{
"latitude": 19.265137,
"longitude": -66.277084,
"text": "TWEET TEXT HERE!!"
},
{
"latitude": -30.662949,
"longitude": 10.100861,
"text": "TWEET TEXT HERE!!"
},
{
"latitude": 2.71669,
"longitude": 172.61423,
"text": "TWEET TEXT HERE!!"
}
]
...
}
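To make the key format concrete, here is a small sketch of how such a bucket key can be decoded back into a point in time; parseBucketKey is a hypothetical helper for illustration, not part of the visualization code:

import datetime

def parseBucketKey(key):
    # key format: D-Minute-Hour-Day-Month-Year, e.g. "D-01-10-01-01-2016"
    _, minute, hour, day, month, year = key.split("-")
    return datetime.datetime(int(year), int(month), int(day), int(hour), int(minute))

print(parseBucketKey("D-01-10-01-01-2016"))  # -> 2016-01-01 10:01:00

Note that the bucket keys use months 1-12, while setup.startMonth follows the JavaScript Date convention of 0-11 (the Format methods below subtract 1 accordingly).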
To crawl the Twitter data, we use the Tweepy framework for Python. Using authentication credentials generated via Twitter Developer, it provides direct access to the REST API as well as to the Streaming API. This way we are able to obtain the tweets directly from Twitter.
Live operation allows users to query hashtags of their own choosing and feed them into our visualization. To do so, they merely log in with their Twitter account and enter the desired hashtag. Our script scans and processes the data automatically so that the visualization can be generated. We distinguish between two possible scenarios, covered by the two scripts below: querying tweets from the recent past via the REST API (RestCrawler.py) and receiving new tweets live via the Streaming API (StreamingCrawler.py). The REST variant pages backwards through the search results until Twitter's rate limit is reached:
import tweepy
from tweepy import OAuthHandler
import json
import sys
import time
import sqlite3 as lite
from lib import Tweet
# enter twitter-credentials here
consumer_key = '' # App-Key
consumer_secret = '' # App-Secret
access_token = '' # User-token
access_secret = '' # User-Secret
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
def crawlTweets(cursor, query):
    """ perform query and store filtered information into the SQL DB """
    # counter for how many requests have been made
    ctr = 0
    # connect to twitter
    api = tweepy.API(auth)
    # if it is the first query
    if open('lib/lastID.txt', 'r').read() == "":
        # query
        tweets = api.search(q=query, count=100)
        ctr += 1
        # get last id
        oldest = tweets[-1].id - 1
        # backup storage for further queries
        open('lib/lastID.txt', 'w').write(str(tweets[-1].id - 1))
        # check each tweet, if geo data is embedded. If yes -> store into DB
        for tweet in tweets:
            if tweet.geo != None:
                t_id = tweet.id
                t_date = tweet.created_at
                # 'geo' stores [latitude, longitude]
                t_latitude = tweet.geo['coordinates'][0]
                t_longitude = tweet.geo['coordinates'][1]
                t_text = tweet.text
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                print "Tweet added %s" % (t_id)
            elif tweet.place != None:
                t_id = tweet.id
                t_date = tweet.created_at
                # bounding box center; GeoJSON coordinates are [longitude, latitude]
                t_latitude = ((tweet.place.bounding_box.coordinates[0][0][1] + \
                               tweet.place.bounding_box.coordinates[0][2][1]) / 2)
                t_longitude = ((tweet.place.bounding_box.coordinates[0][0][0] + \
                                tweet.place.bounding_box.coordinates[0][2][0]) / 2)
                t_text = tweet.text
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                print "Tweet added %s" % (t_id)
    tweets = [1, 2]
    oldest = open('lib/lastID.txt', 'r').read()
    # repeat until there are no more tweets to crawl
    while len(tweets) > 0:
        tweets = api.search(q=query, count=100, max_id=oldest)
        if len(tweets) == 0:  # guard: the search may return no more results
            break
        print "getting tweets before %s" % (tweets[-1].created_at)
        ctr += 1
        for tweet in tweets:
            if tweet.geo != None:
                t_id = tweet.id
                t_date = tweet.created_at
                t_latitude = tweet.geo['coordinates'][0]
                t_longitude = tweet.geo['coordinates'][1]
                t_text = tweet.text
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                print "Tweet added %s" % (t_id)
            elif tweet.place != None:
                t_id = tweet.id
                t_date = tweet.created_at
                t_latitude = ((tweet.place.bounding_box.coordinates[0][0][1] + \
                               tweet.place.bounding_box.coordinates[0][2][1]) / 2)
                t_longitude = ((tweet.place.bounding_box.coordinates[0][0][0] + \
                                tweet.place.bounding_box.coordinates[0][2][0]) / 2)
                t_text = tweet.text
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                print "Tweet added %s" % (t_id)
        print "...%s tweets downloaded so far" % (ctr * 100)
        oldest = tweets[-1].id - 1
        # backup storage
        open('lib/lastID.txt', 'w').write(str(tweets[-1].id - 1))
        # cannot crawl more tweets in one session (rate limit: 180 requests per 15 minutes)
        if ctr == 180:
            print "...crawling paused - limit reached"
            break
def queryFor(query):
    db = None
    try:
        db = lite.connect('database/restDB.sqlite')
        cursor = db.cursor()
        crawlTweets(cursor, query)
    except lite.Error as e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
    finally:
        if db:
            db.commit()
            db.close()
def main():
    """ connect to db. start crawling """
    while 1:
        queryFor('Brexit')  # choose hashtag
        # wait 15 minutes for the rate limit window to expire, then restart automatically
        print "Wait for Twitter timeout to expire. Then automatically restart"
        time.sleep(300)
        print "10 more minutes"
        time.sleep(300)
        print "5 more minutes"
        time.sleep(240)
        print "1 more minute"
        time.sleep(70)

main()
$ python RestCrawler.py
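The Init class loads the crawled tweets from an SQLite database and converts them into the visualization format described above, at a granularity of one day (Format1), one hour (Format2), or one minute (Format3):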
import sqlite3 as lite
import sys
import datetime
import math
from lib import Tweet
class Init():
    def loadDatabase(self, path):
        db_tweets = []
        db = None
        try:
            db = lite.connect(path)
            cur = db.cursor()
            cur.execute("SELECT * FROM tweets")
            data = cur.fetchall()
            for tweet in data:
                db_tweets.append(Tweet.Data(tweet[0], tweet[1], tweet[2], tweet[3], [], tweet[4]))
            return db_tweets
        except lite.Error as e:
            print "Error %s:" % e.args[0]
            sys.exit(1)
        finally:
            if db:
                db.close()
    def Format1(self, tweets=[]):
        """ Return Json-Coordinates formatted for each Day """
        if tweets != []:
            start = tweets.pop(0)
            output = "{\"setup\":"
            output += "{ \"startYear\": " + str(start.date.split(" ")[0].split("-")[0])
            output += ", \"startMonth\": " + str(int(start.date.split(" ")[0].split("-")[1]) - 1)
            output += ", \"startDay\": " + str(int(start.date.split(" ")[0].split("-")[2]))
            output += ", \"startHour\": " + str(int(start.date.split(" ")[1].split(":")[0]))
            output += ", \"startMinute\": " + str(int(start.date.split(" ")[1].split(":")[1]))
            output += ", \"range\": " + self.getRange(start, tweets[len(tweets)-1])
            output += ", \"step\": \"day\"} , "
            output += "\"startPoint\":["
            output += "{\"latitude\":" + str(start.position.latitude) + ",\"longitude\":" + str(start.position.longitude) + ",\"text\":\"" + start.text.replace("\"", "").replace("\n", "") + "\"}]"
            differentDays = []
            for tweet in tweets:
                differentDays.append(self.getDate(tweet.date, 1))
            for day in set(differentDays):
                output += ",\"" + day + "\":["
                for i in range(len(tweets)):
                    if self.getDate(tweets[i].date, 1) == day:
                        output += "{\"latitude\":" + str(tweets[i].position.latitude) + ",\"longitude\":" + str(tweets[i].position.longitude) + ",\"text\":\"" + tweets[i].text.replace("\"", "").replace("\n", "") + "\"},"
                output = output[:-1]
                output += "]"
            output += "}"
            #open("output/"+name+".json", "w").write(output)
            return output
    def Format2(self, tweets=[]):
        """ Return Json-Coordinates formatted for each Hour + Day """
        if tweets != []:
            start = tweets.pop(0)
            output = "{\"setup\":"
            output += "{ \"startYear\": " + str(start.date.split(" ")[0].split("-")[0])
            output += ", \"startMonth\": " + str(int(start.date.split(" ")[0].split("-")[1]) - 1)
            output += ", \"startDay\": " + str(int(start.date.split(" ")[0].split("-")[2]))
            output += ", \"startHour\": " + str(int(start.date.split(" ")[1].split(":")[0]))
            output += ", \"startMinute\": " + str(int(start.date.split(" ")[1].split(":")[1]))
            output += ", \"range\": " + self.getRange(start, tweets[len(tweets)-1])
            output += ", \"step\": \"hour\"} , "
            output += "\"startPoint\":["
            output += "{\"latitude\":" + str(start.position.latitude) + ",\"longitude\":" + str(start.position.longitude) + ",\"text\":\"" + start.text.replace("\"", "").replace("\n", "") + "\"}]"
            differentDays = []
            for tweet in tweets:
                differentDays.append(self.getDate(tweet.date, 2))
            for day in set(differentDays):
                output += ",\"" + day + "\":["
                for i in range(len(tweets)):
                    if self.getDate(tweets[i].date, 2) == day:
                        output += "{\"latitude\":" + str(tweets[i].position.latitude) + ",\"longitude\":" + str(tweets[i].position.longitude) + ",\"text\":\"" + tweets[i].text.replace("\"", "").replace("\n", "") + "\"},"
                output = output[:-1]
                output += "]"
            output += "}"
            #open("output/"+name+".json", "w").write(output)
            return output
    def Format3(self, tweets=[]):
        """ Return Json-Coordinates formatted for each Minute + Hour + Day """
        if tweets != []:
            start = tweets.pop(0)
            output = "{\"setup\":"
            output += "{ \"startYear\": " + str(start.date.split(" ")[0].split("-")[0])
            output += ", \"startMonth\": " + str(int(start.date.split(" ")[0].split("-")[1]) - 1)
            output += ", \"startDay\": " + str(int(start.date.split(" ")[0].split("-")[2]))
            output += ", \"startHour\": " + str(int(start.date.split(" ")[1].split(":")[0]))
            output += ", \"startMinute\": " + str(int(start.date.split(" ")[1].split(":")[1]) - 1)
            output += ", \"range\": " + self.getRange(start, tweets[len(tweets)-1])
            output += ", \"step\": \"minute\"} , "
            output += "\"startPoint\":["
            output += "{\"latitude\":" + str(start.position.latitude) + ",\"longitude\":" + str(start.position.longitude) + ",\"text\":\"" + start.text.replace("\"", "").replace("\n", "") + "\"}]"
            differentDays = []
            for tweet in tweets:
                differentDays.append(self.getDate(tweet.date, 3))
            for day in set(differentDays):
                output += ",\"" + day + "\":["
                for i in range(len(tweets)):
                    if self.getDate(tweets[i].date, 3) == day:
                        output += "{\"latitude\":" + str(tweets[i].position.latitude) + ",\"longitude\":" + str(tweets[i].position.longitude) + ",\"text\":\"" + tweets[i].text.replace("\"", "").replace("\n", "") + "\"},"
                output = output[:-1]
                output += "]"
            output += "}"
            #open("output/"+name+".json", "w").write(output)
            return output
    def getDate(self, date, mode):
        # build a bucket key D-Minute-Hour-Day-Month-Year at the requested granularity
        if mode == 1:
            d = date.split(" ")
            myDate = d[0]
            year = myDate.split("-")[0]
            month = myDate.split("-")[1]
            day = myDate.split("-")[2]
            output = "D-00-00-" + day + "-" + month + "-" + year
            return output
        elif mode == 2:
            d = date.split(" ")
            myDate = d[0]
            year = myDate.split("-")[0]
            month = myDate.split("-")[1]
            day = myDate.split("-")[2]
            time = d[1]
            hours = time.split(":")[0]
            output = "D-00-" + hours + "-" + day + "-" + month + "-" + year
            return output
        elif mode == 3:
            d = date.split(" ")
            myDate = d[0]
            year = myDate.split("-")[0]
            month = myDate.split("-")[1]
            day = myDate.split("-")[2]
            time = d[1]
            hours = time.split(":")[0]
            minutes = time.split(":")[1]
            output = "D-" + minutes + "-" + hours + "-" + day + "-" + month + "-" + year
            return output
    def getRange(self, start, end):
        s_hou = int(start.date.split(" ")[1].split(":")[0])
        s_min = int(start.date.split(" ")[1].split(":")[1])
        s_day = int(start.date.split(" ")[0].split("-")[2])
        s_mon = int(start.date.split(" ")[0].split("-")[1])
        s_yea = int(start.date.split(" ")[0].split("-")[0])
        s = datetime.datetime(s_yea, s_mon, s_day, s_hou, s_min)
        e_hou = int(end.date.split(" ")[1].split(":")[0])
        e_min = int(end.date.split(" ")[1].split(":")[1])
        e_day = int(end.date.split(" ")[0].split("-")[2])
        e_mon = int(end.date.split(" ")[0].split("-")[1])
        e_yea = int(end.date.split(" ")[0].split("-")[0])
        e = datetime.datetime(e_yea, e_mon, e_day, e_hou, e_min)
        output = str((((e - s).total_seconds() / 60) / 60 / 24))
        return output
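StreamingCrawler.py covers the second scenario: it subscribes to the Streaming API with a filter on the chosen hashtag and writes every incoming geo-tagged tweet directly into the database: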
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import json
import sys
import sqlite3 as lite
from lib import Tweet
# enter twitter-credentials here
consumer_key = '' # App-Key
consumer_secret = '' # App-Secret
access_token = '' # User-token
access_secret = '' # User-Secret
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# overwrite listener
class MyListener(StreamListener):
    # overwrite how data is handled after receive
    def on_data(self, data):
        tweet = json.loads(data)
        db = None
        # if tweet is geo-tagged
        try:
            if tweet['geo'] != None:
                t_id = tweet['id']
                t_date = convertDate(tweet['created_at'])
                # 'geo' stores [latitude, longitude]
                t_latitude = tweet['geo']['coordinates'][0]
                t_longitude = tweet['geo']['coordinates'][1]
                t_text = tweet['text']
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                try:
                    db = lite.connect('database/streamDB.sqlite')
                    cursor = db.cursor()
                    cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                    print "Tweet added at %s" % (t_date)
                except lite.Error as e:
                    print "Error %s:" % e.args[0]
                    sys.exit(1)
                finally:
                    if db:
                        db.commit()
                        db.close()
            elif tweet['place'] != None:
                t_id = tweet['id']
                t_date = convertDate(tweet['created_at'])
                # bounding box center; GeoJSON coordinates are [longitude, latitude]
                t_latitude = ((tweet['place']['bounding_box']['coordinates'][0][0][1] + \
                               tweet['place']['bounding_box']['coordinates'][0][2][1]) / 2)
                t_longitude = ((tweet['place']['bounding_box']['coordinates'][0][0][0] + \
                                tweet['place']['bounding_box']['coordinates'][0][2][0]) / 2)
                t_text = tweet['text']
                t = Tweet.Data(t_id, t_date, t_latitude, t_longitude, [], t_text)
                try:
                    db = lite.connect('database/streamDB.sqlite')
                    cursor = db.cursor()
                    cursor.execute("INSERT INTO tweets (id,date,latitude,longitude,text) VALUES (?,?,?,?,?)", (t.id, t.date, t.position.latitude, t.position.longitude, t.text))
                    print "Tweet added at %s" % (t_date)
                except lite.Error as e:
                    print "Error %s:" % e.args[0]
                    sys.exit(1)
                finally:
                    if db:
                        db.commit()
                        db.close()
        except BaseException:
            pass  # on error just skip the tweet
        return True

    def on_error(self, status):
        print(status)
        return True
def convertDate(date):
    """ Format date to: YYYY-MM-DD HH:MM:SS """
    # input format: 'Fri Oct 24 00:00:59 +0000 2014'
    months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    d = date.split(" ")
    output = d[5] + "-" + months[d[1]] + "-" + d[2] + " " + d[3]
    return output
def main():
    # load streaming API with credentials
    twitter_stream = Stream(auth, MyListener())
    # set filter
    twitter_stream.filter(track=['#Brexit'])  # choose hashtag

main()
$ python StreamingCrawler.py