Added pairRdd/sort/*.py

jleetutorial · jleetutorial · Oct 2, 2017 · Oct 2, 2017 · Oct 2, 2017 · Oct 2, 2017
commit 4f0a0147ff6234b420a3a4d447a7c068fa428fb7
diff --git a/pairRdd/sort/AverageHousePriceSolution.py b/pairRdd/sort/AverageHousePriceSolution.py
@@ -0,0 +1,23 @@
+from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
+from pyspark import SparkContext
+
+
+if __name__ == "__main__":
+    # Average house price per bedroom count, printed in ascending key order.
+    spark = SparkContext("local", "averageHousePriceSolution")
+    spark.setLogLevel("ERROR")
+
+    # Skip any line mentioning "Bedrooms" (presumably just the CSV header row).
+    rows = spark.textFile("in/RealEstate.csv") \
+        .filter(lambda row: "Bedrooms" not in row) \
+        .map(lambda row: row.split(","))
+    # Key: column 3 as an int (bedrooms); value: one-sample AvgCount of column 2.
+    priceByBedrooms = rows.map(
+        lambda cols: (int(float(cols[3])), AvgCount(1, float(cols[2]))))
+
+    totals = priceByBedrooms.reduceByKey(
+        lambda a, b: AvgCount(a.count + b.count, a.total + b.total))
+    averages = totals.mapValues(lambda acc: acc.total / acc.count).sortByKey()
+
+    for bedrooms, avgPrice in averages.collect():
+        print("{} : {}".format(bedrooms, avgPrice))
diff --git a/pairRdd/sort/SortedWordCountProblem.py b/pairRdd/sort/SortedWordCountProblem.py
@@ -0,0 +1,16 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program to read an article from in/word_count.text and
+    output the number of occurrences of each word in descending order.
+
+    Sample output:
+
+    apple : 200
+    shoes : 193
+    bag : 176
+    ...
+
+    '''
diff --git a/pairRdd/sort/SortedWordCountSolution.py b/pairRdd/sort/SortedWordCountSolution.py
@@ -0,0 +1,20 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    # Count the words of in/word_count.text and print them, most frequent first.
+    sc = SparkContext("local", "wordCounts")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    wordRdd = lines.flatMap(lambda line: line.split(" "))
+
+    wordPairRdd = wordRdd.map(lambda word: (word, 1))
+    wordToCountPairs = wordPairRdd.reduceByKey(lambda x, y: x + y)
+
+    # RDD.sortBy orders the (word, count) pairs by the count directly, so there
+    # is no need to swap each pair to (count, word), call sortByKey and then
+    # swap back again just to sort on the value.
+    sortedWordToCountPairs = wordToCountPairs.sortBy(
+        lambda wordToCount: wordToCount[1], ascending=False)
+
+    for word, count in sortedWordToCountPairs.collect():
+        print("{} : {}".format(word, count))