@@ -5,8 +5,8 @@ import org.apache.spark.sql.SparkSession
5
5
6
6
object TypedDataset {
7
7
8
- val AGE_MIDPOINT = " ageMidpoint "
9
- val SALARY_MIDPOINT = " salaryMidPoint "
8
+ val AGE_MIDPOINT = " age_midpoint "
9
+ val SALARY_MIDPOINT = " salary_midpoint "
10
10
val SALARY_MIDPOINT_BUCKET = " salaryMidpointBucket"
11
11
12
12
def main (args : Array [String ]) {
@@ -24,9 +24,9 @@ object TypedDataset {
24
24
25
25
val responseWithRenamedColumns = responseWithSelectedColumns
26
26
.withColumn(" country" , responses.col(" country" ))
27
- .withColumn(AGE_MIDPOINT , responses.col(" age_midpoint " ).cast(" integer" ))
27
+ .withColumn(AGE_MIDPOINT , responses.col(AGE_MIDPOINT ).cast(" integer" ))
28
28
.withColumn(" occupation" , responses.col(" occupation" ))
29
- .withColumn(SALARY_MIDPOINT , responses.col(" salary_midpoint " ).cast(" integer" ))
29
+ .withColumn(SALARY_MIDPOINT , responses.col(SALARY_MIDPOINT ).cast(" integer" ))
30
30
31
31
import session .implicits ._
32
32
val typedDataset = responseWithRenamedColumns.as[Response ]
@@ -44,16 +44,16 @@ object TypedDataset {
44
44
typedDataset.groupBy(typedDataset.col(" occupation" )).count().show()
45
45
46
46
System .out.println(" === Print responses with average mid age less than 20 ===" )
47
- typedDataset.filter(response => response.ageMidPoint .isDefined && response.ageMidPoint .get < 20 ).show()
47
+ typedDataset.filter(response => response.age_midpoint .isDefined && response.age_midpoint .get < 20 ).show()
48
48
49
49
System .out.println(" === Print the result by salary middle point in descending order ===" )
50
50
typedDataset.orderBy(typedDataset.col(SALARY_MIDPOINT ).desc).show()
51
51
52
52
System .out.println(" === Group by country and aggregate by average salary middle point ===" )
53
- typedDataset.filter(response => response.salaryMidPoint .isDefined).groupBy(" country" ).avg(SALARY_MIDPOINT ).show()
53
+ typedDataset.filter(response => response.salary_midpoint .isDefined).groupBy(" country" ).avg(SALARY_MIDPOINT ).show()
54
54
55
55
System .out.println(" === Group by salary bucket ===" )
56
- typedDataset.map(response => response.salaryMidPoint .map(point => Math .round(point / 20000 ) * 20000 ).orElse(None ))
56
+ typedDataset.map(response => response.salary_midpoint .map(point => Math .round(point / 20000 ) * 20000 ).orElse(None ))
57
57
.withColumnRenamed(" value" , SALARY_MIDPOINT_BUCKET )
58
58
.groupBy(SALARY_MIDPOINT_BUCKET )
59
59
.count()
0 commit comments