Added pairRdd/aggregation/reducebykey/housePrice/*.py

jleetutorial · jleetutorial · Oct 2, 2017 · Oct 2, 2017 · Oct 2, 2017 · Oct 2, 2017
commit fb3dfcbf7b00b5e7026c9b8aa8fd997d92d79d58
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceProblem.py
@@ -0,0 +1,35 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+
+    '''
+    Create a Spark program that reads the house data from in/RealEstate.csv
+    and outputs the average price of houses for each number of bedrooms.
+
+    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
+    around it. 
+
+    The dataset contains the following fields:
+    1. MLS: Multiple listing service number for the house (unique ID).
+    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
+    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadalupe, Los Alamos), but there
+    are some out-of-area locations as well.
+    3. Price: the most recent listing price of the house (in dollars).
+    4. Bedrooms: number of bedrooms.
+    5. Bathrooms: number of bathrooms.
+    6. Size: size of the house in square feet.
+    7. Price/SQ.ft: price of the house per square foot.
+    8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.
+
+    Each field is comma separated.
+
+    Sample output:
+
+       (3, 325000)
+       (1, 266356)
+       (2, 325000)
+       ...
+
+    3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
+
+    '''
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py b/pairRdd/aggregation/reducebykey/housePrice/AverageHousePriceSolution.py
@@ -0,0 +1,24 @@
+from pyspark import SparkContext
+
+if __name__ == "__main__":
+    # Start a Spark context (master "local", app "avgHousePrice"); log errors only.
+    sc = SparkContext("local", "avgHousePrice")
+    sc.setLogLevel("ERROR")
+    # Read the listings; drop lines containing "Bedrooms" (presumably the header row).
+    lines = sc.textFile("in/RealEstate.csv")
+    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
+    # Key each row by field 3 (bedrooms, kept as a string); value is (1, price from field 2).
+    housePricePairRdd = cleanedLines.map(lambda line: \
+        (line.split(",")[3], (1, float(line.split(",")[2]))))
+    # Per key, add up the counts and the prices, giving (listing count, price total).
+    housePriceTotal = housePricePairRdd \
+        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
+    # collect() brings every (bedrooms, (count, total)) pair back to the driver for printing.
+    print("housePriceTotal: ")
+    for bedroom, total in housePriceTotal.collect():
+        print("{} : {}".format(bedroom, total))
+    # Average = total / count; the lambda's argument is the (count, total) tuple.
+    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
+    print("\nhousePriceAvg: ")
+    for bedroom, avg in housePriceAvg.collect():
+        print("{} : {}".format(bedroom, avg))
diff --git a/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py b/pairRdd/aggregation/reducebykey/housePrice/AvgCount.py
@@ -0,0 +1,7 @@
+class AvgCount():
+    # Pairs a count with a total -- presumably the operands of an average (total / count).
+    def __init__(self, count: int, total: float):
+        self.count = count
+        self.total = total
+
+