Skip to content

Commit 4f0a014

Browse files
author
Pedro Bernardo
committed
Added pairRdd/sort/*.py
1 parent 35b653c commit 4f0a014

File tree

3 files changed

+59
-0
lines changed

3 files changed

+59
-0
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
from pairRdd.aggregation.reducebykey.housePrice.AvgCount import AvgCount
from pyspark import SparkContext


if __name__ == "__main__":
    """Compute the average house price per bedroom count from in/RealEstate.csv,
    printed in ascending order of bedroom count."""

    sc = SparkContext("local", "averageHousePriceSolution")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/RealEstate.csv")
    # Drop the header row (the only line containing the word "Bedrooms").
    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)

    def toBedroomPricePair(line):
        # Split each line once (the original split twice per record):
        # column 3 = bedroom count (key), column 2 = price (value).
        cols = line.split(",")
        return (int(float(cols[3])), AvgCount(1, float(cols[2])))

    housePricePairRdd = cleanedLines.map(toBedroomPricePair)

    # Sum counts and totals per bedroom count.
    housePriceTotal = housePricePairRdd.reduceByKey(
        lambda x, y: AvgCount(x.count + y.count, x.total + y.total))

    # Average = accumulated total / accumulated count.
    housePriceAvg = housePriceTotal.mapValues(
        lambda avgCount: avgCount.total / avgCount.count)

    sortedHousePriceAvg = housePriceAvg.sortByKey()

    for bedrooms, avgPrice in sortedHousePriceAvg.collect():
        print("{} : {}".format(bedrooms, avgPrice))
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
from pyspark import SparkContext

if __name__ == "__main__":
    # Exercise stub: the triple-quoted string below states the task to implement.
    '''
    Create a Spark program to read an article from in/word_count.text,
    output the number of occurrences of each word in descending order.

    Sample output:

    apple : 200
    shoes : 193
    bag : 176
    ...
    '''
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
from pyspark import SparkContext

if __name__ == "__main__":
    """Count word occurrences in in/word_count.text and print them in
    descending order of frequency."""

    sc = SparkContext("local", "wordCounts")
    sc.setLogLevel("ERROR")
    lines = sc.textFile("in/word_count.text")
    wordRdd = lines.flatMap(lambda line: line.split(" "))

    # Classic word count: pair every word with 1, then sum per word.
    wordPairRdd = wordRdd.map(lambda word: (word, 1))
    wordToCountPairs = wordPairRdd.reduceByKey(lambda x, y: x + y)

    # sortByKey only sorts on the key, so swap to (count, word),
    # sort descending, then swap back. (Fixed misspelled locals:
    # "countToWordParis" -> "countToWordPairs".)
    countToWordPairs = wordToCountPairs.map(
        lambda wordToCount: (wordToCount[1], wordToCount[0]))
    sortedCountToWordPairs = countToWordPairs.sortByKey(ascending=False)
    sortedWordToCountPairs = sortedCountToWordPairs.map(
        lambda countToWord: (countToWord[1], countToWord[0]))

    for word, count in sortedWordToCountPairs.collect():
        print("{} : {}".format(word, count))

0 commit comments

Comments
 (0)