Skip to content

Commit 807e4d2

Browse files
author
Pedro Bernardo
committed
Added sparkSql/StackOverFlowSurvey.py
1 parent 4b038de commit 807e4d2

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

sparkSql/StackOverFlowSurvey.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from pyspark.sql import SparkSession
2+
3+
AGE_MIDPOINT = "age_midpoint"
4+
SALARY_MIDPOINT = "salary_midpoint"
5+
SALARY_MIDPOINT_BUCKET = "salary_midpoint_bucket"
6+
7+
if __name__ == "__main__":
8+
9+
session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
10+
session.sparkContext.setLogLevel("ERROR")
11+
dataFrameReader = session.read
12+
13+
responses = dataFrameReader \
14+
.option("header", "true") \
15+
.option("inferSchema", value = True) \
16+
.csv("in/2016-stack-overflow-survey-responses.csv")
17+
18+
print("=== Print out schema ===")
19+
responses.printSchema()
20+
21+
responseWithSelectedColumns = responses.select("country", "occupation", AGE_MIDPOINT, SALARY_MIDPOINT)
22+
23+
print("=== Print the selected columns of the table ===")
24+
responseWithSelectedColumns.show()
25+
26+
print("=== Print records where the response is from Afghanistan ===")
27+
responseWithSelectedColumns.filter(responseWithSelectedColumns["country"] == "Afghanistan").show()
28+
29+
print("=== Print the count of occupations ===")
30+
groupedDataset = responseWithSelectedColumns.groupBy("occupation")
31+
groupedDataset.count().show()
32+
33+
print("=== Print records with average mid age less than 20 ===")
34+
responseWithSelectedColumns.filter(responseWithSelectedColumns[AGE_MIDPOINT] < 20).show()
35+
36+
print("=== Print the result by salary middle point in descending order ===")
37+
responseWithSelectedColumns.orderBy(responseWithSelectedColumns[SALARY_MIDPOINT], ascending=False).show()
38+
39+
print("=== Group by country and aggregate by average salary middle point ===")
40+
datasetGroupByCountry = responseWithSelectedColumns.groupBy("country")
41+
datasetGroupByCountry.avg(SALARY_MIDPOINT).show()
42+
43+
responseWithSalaryBucket = responses.withColumn(SALARY_MIDPOINT_BUCKET,
44+
((responses[SALARY_MIDPOINT]/20000).cast("integer")*20000))
45+
46+
print("=== With salary bucket column ===")
47+
responseWithSalaryBucket.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()
48+
49+
print("=== Group by salary bucket ===")
50+
responseWithSalaryBucket.groupBy(SALARY_MIDPOINT_BUCKET).count().orderBy(SALARY_MIDPOINT_BUCKET).show()
51+
52+
session.stop()

0 commit comments

Comments
 (0)