"""Consume a socket text stream of tweets with Spark Streaming and print each batch.

Run under ``spark-submit`` (or from within ``pyspark``). Expects a tweet
source writing plain text lines to localhost:9009.

NOTE(review): this file was reconstructed from a commit-diff rendering; this
is the post-commit version. The previous revision (visible on the removed
lines of the diff) counted hashtags with ``updateStateByKey`` and POSTed the
top 5 to a dashboard at http://localhost:5001/updateData; that logic was
dropped in favor of a plain ``pprint()`` of the raw stream.
"""

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

# NOTE(review): Row, SQLContext, sys and requests are no longer used now that
# the aggregation/dashboard code was removed in this commit. Kept to avoid a
# behavior change in this review pass — candidates for deletion.
from pyspark.sql import Row, SQLContext
import sys
import requests

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")  # silence Spark's INFO/WARN chatter on the console

# 300-second (5-minute) micro-batch interval.
ssc = StreamingContext(sc, 300)

# Checkpoint directory; required by stateful operations and useful for recovery.
ssc.checkpoint("checkpoint_TwitterApp")

# Connect to the tweet source on localhost, port 9009.
# (The original inline comment said "port 5555", contradicting the code — fixed.)
dataStream = ssc.socketTextStream("localhost", 9009)

dataStream.pprint()  # print each incoming batch of tweets to the console

ssc.start()             # start consuming the stream
ssc.awaitTermination()  # block until the streaming job is stopped or fails
0 commit comments