Skip to content

Commit fb3dfcb

Browse files
author
Pedro Bernardo
committed
Added pairRdd/aggregation/reducebykey/housePrice/*.py
1 parent 43f7883 commit fb3dfcb

File tree

3 files changed

+66
-0
lines changed

3 files changed

+66
-0
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from pyspark import SparkContext
2+
3+
if __name__ == "__main__":
4+
5+
'''
6+
Create a Spark program to read the house data from in/RealEstate.csv,
7+
output the average price for houses with different number of bedrooms.
8+
9+
The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
10+
around it. 
11+
12+
The dataset contains the following fields:
13+
1. MLS: Multiple listing service number for the house (unique ID).
14+
2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
15+
northern Santa Barbara county (Santa Maria­Orcutt, Lompoc, Guadelupe, Los Alamos), but there
16+
some out of area locations as well.
17+
3. Price: the most recent listing price of the house (in dollars).
18+
4. Bedrooms: number of bedrooms.
19+
5. Bathrooms: number of bathrooms.
20+
6. Size: size of the house in square feet.
21+
7. Price/SQ.ft: price of the house per square foot.
22+
8. Status: type of sale. Thee types are represented in the dataset: Short Sale, Foreclosure and Regular.
23+
24+
Each field is comma separated.
25+
26+
Sample output:
27+
28+
(3, 325000)
29+
(1, 266356)
30+
(2, 325000)
31+
...
32+
33+
3, 1 and 2 mean the number of bedrooms. 325000 means the average price of houses with 3 bedrooms is 325000.
34+
35+
'''
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from pyspark import SparkContext
2+
3+
if __name__ == "__main__":
4+
5+
sc = SparkContext("local", "avgHousePrice")
6+
sc.setLogLevel("ERROR")
7+
8+
lines = sc.textFile("in/RealEstate.csv")
9+
cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)
10+
11+
housePricePairRdd = cleanedLines.map(lambda line: \
12+
(line.split(",")[3], (1, float(line.split(",")[2]))))
13+
14+
housePriceTotal = housePricePairRdd \
15+
.reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
16+
17+
print("housePriceTotal: ")
18+
for bedroom, total in housePriceTotal.collect():
19+
print("{} : {}".format(bedroom, total))
20+
21+
housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])
22+
print("\nhousePriceAvg: ")
23+
for bedroom, avg in housePriceAvg.collect():
24+
print("{} : {}".format(bedroom, avg))
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
class AvgCount():
2+
3+
def __init__(self, count: int, total: float):
4+
self.count = count
5+
self.total = total
6+
7+

0 commit comments

Comments
 (0)