# Import the libraries
from tweepy import Stream
from tweepy import OAuthHandler
#from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import re # regular expression
import string
import preprocessor as p
import os
import time
import datetime
# Authenticating to the Twitter API
# Obtain your Twitter credentials from your Twitter developer account
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''
# Pass your Twitter credentials to tweepy via its OAuthHandler
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)  # block instead of erroring when the rate limit is hit
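# Quick sanity check (a sketch added for illustration, not part of the original
# script): verify_credentials() raises if the keys above are wrong, so a bad
# token fails loudly here instead of mid-scrape. Assumes tweepy 3.x, where the
# exception class is TweepError.
try:
    api.verify_credentials()
    print('Twitter authentication OK')
except tweepy.TweepError as e:
    print('Twitter authentication failed:', e)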
## Automating the scraping
# Calls the API every 15 minutes to avoid exceeding the rate limit
# 1. define a for-loop over the runs
# 2. define the search parameters
# 3. define the date period
# 4. define the no. of tweets to pull per run
def scraptweets(search_words, date_since, numTweets, numRuns):
    ## Arguments:
    # search_words -> a string of keywords for this function to search on
    # date_since -> the date from which to start extracting tweets
    # numTweets -> number of tweets to extract per run
    # numRuns -> number of runs to perform - the search API is rate-limited,
    #            so each run is spaced 15 minutes apart.
    ##
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(columns=['Text', 'Username', 'Created', 'Retweeted', 'URL'])
    # Define a for-loop to collect tweets at regular intervals
    for i in range(numRuns):
        # Time how long it takes to scrape tweets for each run:
        start_run = time.time()
        # Collect tweets using the Cursor object.
        # .Cursor() returns an iterable; each item is a Status object whose
        # attributes hold the information about one tweet.
        # (In tweepy >= 4.0 this method is named api.search_tweets.)
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en",
                               since=date_since, tweet_mode='extended').items(numTweets)
        # Store these tweets in a python list
        tweet_list = [tweet for tweet in tweets]
        # Attributes available on each tweet (the commented-out pulls below are unused here):
        # user.screen_name - twitter handle
        # user.description - description of the account
        # user.location - where the user is tweeting from
        # user.friends_count - no. of other users that the user is following (following)
        # user.followers_count - no. of other users who are following this user (followers)
        # user.statuses_count - total tweets by the user
        # user.created_at - when the user account was created
        # created_at - when the tweet was created
        # retweet_count - no. of retweets
        # (deprecated) user.favourites_count - total no. of tweets favourited by the user
        # retweeted_status.full_text - full text of the original tweet behind a retweet
        # entities['hashtags'] - hashtags in the tweet
        # Begin scraping the tweets individually:
        noTweets = 0
        for tweet in tweet_list:
            # Pull the values
            username = tweet.user.screen_name
            #acctdesc = tweet.user.description
            #location = tweet.user.location
            #following = tweet.user.friends_count
            #followers = tweet.user.followers_count
            #totaltweets = tweet.user.statuses_count
            #usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            #hashtags = tweet.entities['hashtags']
            try:
                text = tweet.retweeted_status.full_text
                url = tweet.retweeted_status.extended_entities['media'][0]['url']
            except AttributeError:  # not a retweet (or a retweet without media)
                text = tweet.full_text
                url = pd.NA
            # Add the 5 variables to a list for this tweet:
            ith_tweet = [text, username, tweetcreatedts, retweetcount, url]
            # Append to the dataframe - db_tweets
            db_tweets.loc[len(db_tweets)] = ith_tweet
            noTweets += 1
        duration_run = round((time.time() - start_run) / 60, 2)
        print('Run {} collected {} tweets in {} mins'.format(i + 1, noTweets, duration_run))
        # Wait 15 minutes between runs to respect the search rate limit
        if i + 1 < numRuns:
            time.sleep(900)
    # Drop duplicate texts and mark missing media URLs before returning
    db_tweets = db_tweets.drop_duplicates(subset=['Text'])
    db_tweets = db_tweets.fillna('NO URL')
    return db_tweets
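# Example driver (a minimal sketch, not part of the original function; the
# query, date, and counts are hypothetical placeholders - substitute your own):
if __name__ == '__main__':
    search_words = '#covid19 -filter:retweets'   # hypothetical search string
    date_since = '2020-04-01'                    # hypothetical start date
    df = scraptweets(search_words, date_since, numTweets=100, numRuns=2)
    # Timestamp the filename so repeated runs do not overwrite earlier output
    outfile = 'tweets_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.csv'
    df.to_csv(outfile, index=False, quoting=csv.QUOTE_ALL)
    print('Saved {} tweets to {}'.format(len(df), outfile))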