Skip to content

Commit 96f979a

Browse files
author
James Lee
committed
adapt StackOverFlowSurvey
1 parent 9e7ef26 commit 96f979a

File tree

2 files changed

+67
-71
lines changed

2 files changed

+67
-71
lines changed

src/main/scala/com/sparkTutorial/sparkSql/StackOverFlowSurvey.java

Lines changed: 0 additions & 71 deletions
This file was deleted.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package com.sparkTutorial.sparkSql
2+
3+
import org.apache.log4j.{Level, Logger}
4+
import org.apache.spark.sql.SparkSession
5+
6+
object StackOverFlowSurvey {
7+
8+
val AGE_MIDPOINT = "age_midpoint"
9+
val SALARY_MIDPOINT = "salary_midpoint"
10+
val SALARY_MIDPOINT_BUCKET = "salary_midpoint_bucket"
11+
12+
def main(args: Array[String]) {
13+
14+
Logger.getLogger("org").setLevel(Level.ERROR)
15+
val session = SparkSession.builder().appName("StackOverFlowSurvey").master("local[1]").getOrCreate()
16+
17+
val dataFrameReader = session.read
18+
19+
val responses = dataFrameReader.option("header", "true").csv("in/2016-stack-overflow-survey-responses.csv")
20+
21+
System.out.println("=== Print out schema ===")
22+
responses.printSchema()
23+
24+
System.out.println("=== Print 20 records of responses table ===")
25+
responses.show(20)
26+
27+
System.out.println("=== Print the so_region and self_identification columns of gender table ===")
28+
responses.select("so_region", "self_identification").show()
29+
30+
System.out.println("=== Print records where the response is from Afghanistan ===")
31+
responses.filter(responses.col("country").===("Afghanistan")).show()
32+
33+
System.out.println("=== Print the count of occupations ===")
34+
val groupedDataset = responses.groupBy("occupation")
35+
groupedDataset.count().show()
36+
37+
System.out.println("=== Cast the salary mid point and age mid point to integer ===")
38+
val castedResponse = responses.withColumn(SALARY_MIDPOINT, responses.col(SALARY_MIDPOINT).cast("integer"))
39+
.withColumn(AGE_MIDPOINT, responses.col(AGE_MIDPOINT).cast("integer"))
40+
41+
System.out.println("=== Print out casted schema ===")
42+
castedResponse.printSchema()
43+
44+
import session.implicits._
45+
System.out.println("=== Print records with average mid age less than 20 ===")
46+
castedResponse.filter($"age_midpoint" < 20).show()
47+
48+
System.out.println("=== Print the result by salary middle point in descending order ===")
49+
castedResponse.orderBy(castedResponse.col(SALARY_MIDPOINT).desc).show()
50+
51+
System.out.println("=== Group by country and aggregate by average salary middle point and max age middle point ===")
52+
val datasetGroupByCountry = castedResponse.groupBy("country")
53+
datasetGroupByCountry.avg(SALARY_MIDPOINT).show()
54+
55+
56+
val responseWithSalaryBucket = castedResponse.withColumn(
57+
SALARY_MIDPOINT_BUCKET, castedResponse.col(SALARY_MIDPOINT).divide(20000).cast("integer").multiply(20000))
58+
59+
System.out.println("=== With salary bucket column ===")
60+
responseWithSalaryBucket.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()
61+
62+
System.out.println("=== Group by salary bucket ===")
63+
responseWithSalaryBucket.groupBy(SALARY_MIDPOINT_BUCKET).count().orderBy(SALARY_MIDPOINT_BUCKET).show()
64+
65+
session.stop()
66+
}
67+
}

0 commit comments

Comments
 (0)