Skip to content

Commit 65765cf

Browse files
Modified files for simplicity.
Modified the original streaming app to just print tweets.
1 parent 52be70f commit 65765cf

File tree

2 files changed

+15
-47
lines changed

2 files changed

+15
-47
lines changed
Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,24 @@
11
"""Simplified Spark Streaming app.

Reads lines (tweets) from a local TCP socket and prints each batch to the
console. The previous hashtag-counting / dashboard-posting logic was removed
in this commit, so the app now only echoes the incoming stream.
"""

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext  # run from within pyspark or via spark-submit
from pyspark.sql import Row, SQLContext  # NOTE(review): unused after simplification — candidate for removal
import sys                               # NOTE(review): unused after simplification
import requests                          # NOTE(review): unused after simplification

# Configure and create the Spark context for this app.
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")  # suppress Spark's INFO/WARN console noise

# 300-second (5 minute) batch interval; checkpoint dir is required by
# Spark Streaming for fault tolerance / stateful operations.
ssc = StreamingContext(sc, 300)
ssc.checkpoint("checkpoint_TwitterApp")

# Connect to the tweet source at localhost:9009.
# (The original comment claimed port 5555, but the code actually uses 9009 —
# keep this in sync with the port the tweet-forwarding script listens on.)
dataStream = ssc.socketTextStream("localhost", 9009)

# Print each incoming batch of tweets to the console.
dataStream.pprint()

ssc.start()             # start reading the stream
ssc.awaitTermination()  # block until the streaming job is stopped

getting_started_with_spark_streaming/python/twitter_app.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
1-
import os
import socket
import sys
import requests
import requests_oauthlib
import json

# SECURITY(review): real-looking Twitter API credentials were committed here
# in plain text. Anything committed must be considered leaked — these keys
# should be revoked and rotated. The values are now read from the environment;
# the original literals remain only as fallbacks so existing setups keep
# working unchanged when the variables are not set.
ACCESS_TOKEN = os.environ.get(
    "TWITTER_ACCESS_TOKEN", '2988835149-OgWXTLsBiosW74ZJi563l64WQ8f6tNudQKpLudp')
ACCESS_SECRET = os.environ.get(
    "TWITTER_ACCESS_SECRET", 'eOpeUX1wqL5sq9UK7yFbOPZ7ydYRZVqJ2Q2W7w3b1si7V')
CONSUMER_KEY = os.environ.get(
    "TWITTER_CONSUMER_KEY", 'AYYI7CvstplEi3fBAJ24vLVBA')
CONSUMER_SECRET = os.environ.get(
    "TWITTER_CONSUMER_SECRET", 'OGFsmodX5DnHBcmZA3OrwFIeXS0gSUfPXcZVGTUOdffItb5Z0N')

# OAuth1 signer passed to `requests` when calling the Twitter streaming API.
my_auth = requests_oauthlib.OAuth1(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET)

0 commit comments

Comments
 (0)