Skip to content

Commit 7bcccc9

Browse files
author
Pedro Bernardo
committed
Added pairRdd/aggregation/combinebykey/AverageHousePriceSolution.py
1 parent c6a3a82 commit 7bcccc9

File tree

1 file changed

+21
-0
lines changed

1 file changed

+21
-0
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from pyspark import SparkContext
2+
3+
if __name__ == "__main__":
4+
5+
sc = SparkContext("local", "AverageHousePrice")
6+
sc.setLogLevel("ERROR")
7+
8+
lines = sc.textFile("in/RealEstate.csv")
9+
cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
10+
11+
housePricePairRdd = cleanedLines.map(lambda line: (line.split(",")[3], float(line.split(",")[2])))
12+
13+
createCombiner = lambda x: (1, x)
14+
mergeValue = lambda avgCount, x: (avgCount[0] + 1, avgCount[1] + x)
15+
mergeCombiners = lambda avgCountA, avgCountB: (avgCountA[0] + avgCountB[0], avgCountA[1] + avgCountB[1])
16+
17+
housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
18+
19+
housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
20+
for bedrooms, avgPrice in housePriceAvg.collect():
21+
print("{} : {}".format(bedrooms, avgPrice))

0 commit comments

Comments
 (0)