Tweepy is a Python library for interacting with Twitter's API. This program connects to Twitter and stores tweets containing occurrences of "WWDC", "WWDC16", or "WWDC2016" in a MongoDB database. I started running this program 15 minutes before WWDC started on Monday, June 13, and stopped it 15 minutes after WWDC ended.
import tweepy
import sys
import pymongo
# Twitter application credentials. They are intentionally left blank here;
# fill them in before running.
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""

# Build the OAuth handler from the consumer pair, attach the access token
# pair, and wrap the handler in an API object.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that echoes each tweet and saves it to MongoDB.

    Every status received is printed to stdout and a subset of its fields
    is inserted into the ``Tweets`` collection of the ``WWDC_test``
    database.
    """

    def __init__(self, api):
        """Initialise the listener and open the MongoDB database handle.

        Args:
            api: The ``tweepy.API`` object to associate with this listener.
        """
        # super() must name *this* class. Naming tweepy.StreamListener makes
        # the lookup start after it in the MRO, so the base initialiser
        # would be skipped entirely.
        super(CustomStreamListener, self).__init__(api)
        self.api = api
        # MongoClient() is called with no arguments, so pymongo's default
        # connection settings apply.
        self.db = pymongo.MongoClient().WWDC_test

    def on_status(self, status):
        """Print the tweet text and store selected fields in MongoDB.

        Args:
            status: The status object delivered by the stream.
        """
        # Same output as the Python 2 statement `print status.text, "\n"`
        # (text, a space, then two newlines), written in a form that is
        # valid on both Python 2 and Python 3.
        sys.stdout.write("%s \n\n" % status.text)

        user = status.user
        data = {
            'text': status.text,
            'hashtags': status.entities.get('hashtags'),
            'created_at': status.created_at,
            'screen_name': user.screen_name,
            'followers_count': user.followers_count,
            'friends_count': user.friends_count,
            'coordinates': status.coordinates,
            'source': status.source,
        }
        # insert_one() replaces Collection.insert(), which pymongo 3
        # deprecates and pymongo 4 removes.
        self.db.Tweets.insert_one(data)

    def on_error(self, status_code):
        """Report a stream error on stderr and keep the stream running.

        Args:
            status_code: The error status code reported by the stream.

        Returns:
            True, so that the stream is not killed.
        """
        sys.stderr.write(
            'Encountered error with status code: %s\n' % (status_code,)
        )
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a timeout on stderr and keep the stream running.

        Returns:
            True, so that the stream is not killed.
        """
        sys.stderr.write('Timeout...\n')
        return True  # Don't kill the stream
# Open a stream authenticated with `auth` and hand every incoming status to
# a CustomStreamListener.
sapi = tweepy.streaming.Stream(
    auth,
    CustomStreamListener(api),
)

# Start streaming, restricted to English tweets that match any of the
# tracked WWDC keywords.
sapi.filter(
    track=['WWDC', 'WWDC2016', 'WWDC16'],
    languages=["en"],
)
The Twitter data is stored in a MongoDB database. I export this data to a JSON file for safekeeping.
import os
import pymongo
# Dump the Tweets collection of the WWDC_test database to a JSON file on the
# Desktop by running the mongoexport command-line tool through the shell.
# The adjacent string literals concatenate into a single command string.
os.system(
    "mongoexport"
    " --host localhost"
    " --db WWDC_test"
    " --collection=Tweets"
    " --out ~/Desktop/wwdc_test.json"
)
That's it for getting the data. Tweepy makes things pretty easy. Next we will start cleaning and analyzing the data.