-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_app.py
64 lines (54 loc) · 2.56 KB
/
twitter_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import socket
import sys
import requests
import requests_oauthlib
import json
ACCESS_TOKEN = '#'
ACCESS_SECRET = '#'
CONSUMER_KEY = '#'
CONSUMER_SECRET = '#'
my_auth = requests_oauthlib.OAuth1(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)
def get_tweets():
#this is twitter live-streaming url so here we get data continuesly and it will never end
#in case of normal streaming (like getting image from server) it will end when whole object came
url = 'https://stream.twitter.com/1.1/statuses/filter.json'
query_data = [('language', 'en'), ('locations', '-130,-20,100,50'), ('track', '#')]
query_url = url + '?' + '&'.join([str(t[0]) + '='+str(t[1]) for t in query_data])
#getting data from request streams
response = requests.get(query_url, auth=my_auth, stream=True)
print(query_url, response)
return response
#fecthoing tweets from above response
def send_tweets_to_spark(http_resp, tcp_connection):
# getting data line by line from server( this is happening because request stream is true so we can get
# chunk of data from server one by one until we get all data of response )
for line in http_resp.iter_lines():
try:
#loading data from json as python object
full_tweet = json.loads(line)
tweet_text = full_tweet['text']
print("-----------------------------------")
''' sending data, coming line by line from twitter response stream to spark using tcp socket (tcp socket is
used for spark streaming ). here spark creates a batches (a batch is in form of rdd internally) on the
basis of time interval defined by us.
for eg if we set time interval 2 sec then it will create a batch for data which came in 2 sec
and create a another batch for data which came in next 2 sec to spark and then that batches goes to
spark engine one by one'''
''' Spark Streaming provides a way to consume a continuous data stream'''
'''We can only have one StreamingContext per JVM.'''
tcp_connection.send(bytes(((tweet_text)+ '\n'),'utf-8'))
except Exception as e:
print("Error %s" %e)
# configuring a spark stream on localhost:9009 using TCP socket
TCP_IP = "localhost"
TCP_PORT = 9019
conn = None
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((TCP_IP, TCP_PORT))
s.listen(1)
print("Waiting for tcp connection... ")
conn, addr = s.accept()
print("this is addr", addr)
print("Connected ... Starting getting tweets.")
resp = get_tweets()
send_tweets_to_spark(resp, conn)