Getting the WWDC Data

Tweepy is a Python library for interacting with Twitter's API. This program connects to Twitter's streaming API and stores every tweet containing an occurrence of "WWDC", "WWDC16", or "WWDC2016" in a MongoDB database. I started running the program 15 minutes before WWDC started on Monday, June 13, and stopped it 15 minutes after WWDC ended.


import tweepy
import sys
import pymongo


consumer_key=""
consumer_secret=""


access_token=""
access_token_secret=""

# Authenticate with Twitter and create an API handle
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)



class CustomStreamListener(tweepy.StreamListener):
    def __init__(self, api):
        super(CustomStreamListener, self).__init__()
        self.api = api

        # Store tweets in the WWDC_test database on the local MongoDB instance
        self.db = pymongo.MongoClient().WWDC_test


    def on_status(self, status):
        print status.text, "\n"

        # Keep only the fields we need for the analysis
        data = {}
        data['text'] = status.text
        data['hashtags'] = status.entities.get('hashtags')
        data['created_at'] = status.created_at
        data['screen_name'] = status.user.screen_name
        data['followers_count'] = status.user.followers_count
        data['friends_count'] = status.user.friends_count
        data['coordinates'] = status.coordinates
        data['source'] = status.source

        self.db.Tweets.insert(data)

    def on_error(self, status_code):
        print >> sys.stderr, 'Encountered error with status code:', status_code
        return True # Don't kill the stream

    def on_timeout(self):
        print >> sys.stderr, 'Timeout...'
        return True # Don't kill the stream

# Stream tweets (English only) that mention any of the WWDC keywords
sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
sapi.filter(track=['WWDC', 'WWDC2016', 'WWDC16'], languages=["en"])
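
While the stream is running, it is easy to sanity-check the collection from another Python session. Here is a minimal sketch, assuming MongoDB is running locally with the WWDC_test database and Tweets collection used above:


import pymongo

db = pymongo.MongoClient().WWDC_test

# Number of tweets collected so far
print db.Tweets.count()

# Peek at the most recently inserted tweet
print db.Tweets.find_one(sort=[('_id', pymongo.DESCENDING)])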

The Twitter data is stored in a MongoDB database. I export this data to a JSON file for safekeeping.


import os


# Dump the Tweets collection to a newline-delimited JSON file on the Desktop
os.system("mongoexport --host localhost --db WWDC_test --collection=Tweets --out ~/Desktop/wwdc_test.json")

That's it for getting the data. Tweepy makes things pretty easy. Next we will start cleaning and analyzing the data.