Skip to content

Commit abb5a1e

Browse files
author
Pedro Bernardo
committed
Added advanced/broadcast/*.py
1 parent 5f9071a commit abb5a1e

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

advanced/broadcast/UkMakerSpaces.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from pyspark import SparkContext
2+
from commons.Utils import Utils
3+
4+
def getPostPrefix(line: str):
    """Return the leading part of the postcode stored in a CSV row.

    The row is split with ``Utils.COMMA_DELIMITER`` and the fifth field
    (index 4) is taken as the postcode.

    Returns:
        The text before the first space of the postcode, or ``None`` when
        the postcode field is empty.
    """
    fields = Utils.COMMA_DELIMITER.split(line)
    postcode = fields[4]
    if not postcode:
        return None
    # Everything up to (not including) the first space.
    prefix, _, _ = postcode.partition(" ")
    return prefix
8+
9+
def loadPostCodeMap():
    """Load the postcode lookup table from ``in/uk-postcode.csv``.

    Every non-empty line of the file is split with
    ``Utils.COMMA_DELIMITER``.

    Returns:
        A dict mapping field 0 of each row to field 7 of the same row.
    """
    # A context manager closes the file handle deterministically; the
    # previous bare open(...).read() left that to the garbage collector.
    with open("in/uk-postcode.csv", "r") as postCodeFile:
        lines = postCodeFile.read().split("\n")
    splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
    return {splits[0]: splits[7] for splits in splitsForLines}
13+
14+
if __name__ == "__main__":
    # Count maker spaces per region, resolving each row's postcode prefix
    # through a broadcast lookup table.
    sc = SparkContext("local", "UkMakerSpaces")
    sc.setLogLevel("ERROR")

    # Wrap the lookup dict in a broadcast variable; tasks read it via .value.
    postCodeMap = sc.broadcast(loadPostCodeMap())

    makerSpaceRdd = sc.textFile("in/uk-makerspaces-identifiable-data.csv")

    # Parse the postcode prefix once per row and carry it through the
    # pipeline, instead of re-splitting the same line in the filter, the
    # membership test and the lookup. dict.get() folds the "in" check and
    # the indexing into a single lookup.
    # NOTE(review): the fallback label is spelled "Unknow"; kept as-is so the
    # printed output stays unchanged.
    regions = makerSpaceRdd \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp") \
        .map(getPostPrefix) \
        .filter(lambda prefix: prefix is not None) \
        .map(lambda prefix: postCodeMap.value.get(prefix, "Unknow"))

    for region, count in regions.countByValue().items():
        print("{} : {}".format(region, count))
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from pyspark import SparkContext
2+
from commons.Utils import Utils
3+
4+
def getPostPrefixes(line: str):
    """Return every leading substring of a row's whitespace-free postcode.

    The row is split with ``Utils.COMMA_DELIMITER`` and the fifth field
    (index 4) is taken as the postcode. All whitespace is removed from it
    before the prefixes are built.

    Returns:
        A list of prefixes ordered from shortest to longest, starting with
        the empty string and ending with the full cleaned postcode.
    """
    postcode = Utils.COMMA_DELIMITER.split(line)[4]
    # str.replace() matches its argument literally, so the previous
    # replace("\\s+", "") looked for a backslash followed by "s+" and never
    # stripped anything. split() with no arguments splits on runs of
    # whitespace; joining the pieces removes all of it.
    cleanedPostCode = "".join(postcode.split())
    return [cleanedPostCode[:end] for end in range(len(cleanedPostCode) + 1)]
8+
9+
def loadPostCodeMap():
    """Load the postcode lookup table from ``in/uk-postcode.csv``.

    Every non-empty line of the file is split with
    ``Utils.COMMA_DELIMITER``.

    Returns:
        A dict mapping field 0 of each row to field 7 of the same row.
    """
    # A context manager closes the file handle deterministically; the
    # previous bare open(...).read() left that to the garbage collector.
    with open("in/uk-postcode.csv", "r") as postCodeFile:
        lines = postCodeFile.read().split("\n")
    splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
    return {splits[0]: splits[7] for splits in splitsForLines}
13+
14+
if __name__ == "__main__":
    # Count maker spaces per region using a plain (non-broadcast) lookup
    # dict that is captured by the functions passed to Spark.
    sc = SparkContext("local", "UkMakerSpaces")
    sc.setLogLevel("ERROR")
    postCodeMap = loadPostCodeMap()
    makerSpaceRdd = sc.textFile("in/uk-makerspaces-identifiable-data.csv")

    def isDataRow(line):
        # The header row is the one whose first field is "Timestamp".
        return Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp"

    def regionFor(line):
        # The first prefix present in the lookup table wins.
        for prefix in getPostPrefixes(line):
            if prefix in postCodeMap:
                return postCodeMap[prefix]
        return "Unknow"

    regions = makerSpaceRdd.filter(isDataRow).map(regionFor)

    regionCounts = regions.countByValue()
    for region, count in regionCounts.items():
        print("{} : {}".format(region, count))

0 commit comments

Comments
 (0)