{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Join Operations Exercise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Join Operations\n",
    "\n",
    "Finally, it's worth highlighting how easily you can perform different kinds of joins in Spark Streaming.\n",
    "\n",
    "### Stream-stream joins\n",
    "\n",
    "Streams can be very easily joined with other streams.\n",
    "```python\n",
    "stream1 = ...\n",
    "stream2 = ...\n",
    "joinedStream = stream1.join(stream2)\n",
    "```\n",
    "Here, in each batch interval, the RDD generated by `stream1` will be joined with the RDD generated by `stream2`. You can also do `leftOuterJoin`, `rightOuterJoin`, and `fullOuterJoin` (see the sketch after the windowed example below). Furthermore, it is often very useful to do joins over windows of the streams. That is pretty easy as well.\n",
    "```python\n",
    "windowedStream1 = stream1.window(20)\n",
    "windowedStream2 = stream2.window(60)\n",
    "joinedStream = windowedStream1.join(windowedStream2)\n",
    "```\n",
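    "\n",
    "As a minimal sketch, the outer-join variants use the same call shape; with `leftOuterJoin`, keys from `stream1` that have no match in `stream2` are paired with `None`:\n",
    "```python\n",
    "# left outer join per batch: every record of stream1 is kept\n",
    "leftJoinedStream = stream1.leftOuterJoin(stream2)\n",
    "```\n",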
    "\n",
    "### Stream-dataset joins\n",
    "\n",
    "This has already been shown earlier while explaining the `DStream.transform` operation. Here is yet another example, joining a windowed stream with a dataset.\n",
    "```python\n",
    "dataset = ... # some RDD\n",
    "windowedStream = stream.window(20)\n",
    "joinedStream = windowedStream.transform(lambda rdd: rdd.join(dataset))\n",
    "```\n",
    "In fact, you can also dynamically change the `dataset` you want to join against. The function provided to `transform` is evaluated every batch interval and will therefore use the current dataset that the `dataset` reference points to.\n",
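    "For example (a minimal sketch; `refresh_dataset` is a hypothetical helper that returns the current reference RDD):\n",
    "```python\n",
    "def join_with_current(rdd):\n",
    "    # called every batch interval, so each batch fetches the latest dataset\n",
    "    return rdd.join(refresh_dataset())  # refresh_dataset is hypothetical\n",
    "\n",
    "joinedStream = stream.transform(join_with_current)\n",
    "```\n",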
    "\n",
    "The complete list of DStream transformations is available in the API documentation. For the Python API, see [DStream](https://spark.apache.org/docs/latest/api/python/pyspark.streaming.html#pyspark.streaming.DStream).\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exercise\n",
    "Create a streaming app that joins the incoming orders with our previous knowledge of whether each customer is good or bad."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.\n",
    "findspark.init('/home/matthew/spark-2.1.0-bin-hadoop2.7')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.streaming import StreamingContext\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc = SparkContext()\n",
    "ssc = StreamingContext(sc, 1)  # batch interval of 1 second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For testing, create a prepopulated queue of RDDs simulating streaming customer orders.\n",
    "# Each element is (customer_id, order_data); order_data is None as a placeholder.\n",
    "transaction_rdd_queue = []\n",
    "for i in range(5):\n",
    "    transactions = [(customer_id, None) for customer_id in range(10)]\n",
    "    transaction_rdd = ssc.sparkContext.parallelize(transactions)\n",
    "    transaction_rdd_queue.append(transaction_rdd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch RDD of whether customers are good or bad.\n",
    "# (customer_id, is_good_customer)\n",
    "customers = [(0, True), (1, False), (2, True), (3, False), (4, True),\n",
    "             (5, False), (6, True), (7, False), (8, True), (9, False)]\n",
    "customer_rdd = ssc.sparkContext.parallelize(customers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the input DStream from the queue of RDDs; one RDD is consumed per batch interval.\n",
    "ds = ssc.queueStream(transaction_rdd_queue)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Join each streaming RDD with the batch RDD, then keep only good customers.\n",
    "# After the join, each record is (customer_id, (order_data, is_good_customer)).\n",
    "dst = ds.transform(lambda rdd: rdd.join(customer_rdd)) \\\n",
    "        .filter(lambda record: record[1][1])\n",
    "## END OF EXERCISE SECTION ==================================\n",
    "dst.pprint()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ssc.start()\n",
    "time.sleep(6)  # let the 5 queued RDDs be processed at the 1-second batch interval\n",
    "ssc.stop()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reference\n",
    "1. https://spark.apache.org/docs/latest/streaming-programming-guide.html#join-operations"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}