jleetutorial · jleetutorial · Sep 30, 2017 · Sep 30, 2017 · Sep 30, 2017 · Sep 30, 2017
diff --git a/commons/Utils.py b/commons/Utils.py
@@ -1,5 +1,5 @@
 import re
 
 class Utils():
-    
-    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')
+
+    COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')  # matches a comma only when an even number of double quotes follows it, i.e. a comma outside a double-quoted field
diff --git a/commons/Utils.scala b/commons/Utils.scala
diff --git a/sparkSql/HousePriceProblem.py b/sparkSql/HousePriceProblem.py
@@ -0,0 +1,38 @@
+if __name__ == "__main__":
+
+    '''    
+    Create a Spark program to read the house data from in/RealEstate.csv,
+    group by location, aggregate the average price per SQ Ft and sort by average price per SQ Ft.
+
+    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county and
+    around it. 
+
+    The dataset contains the following fields:
+    1. MLS: Multiple listing service number for the house (unique ID).
+    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
+    northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadelupe, Los Alamos), but there
+    are some out of area locations as well.
+    3. Price: the most recent listing price of the house (in dollars).
+    4. Bedrooms: number of bedrooms.
+    5. Bathrooms: number of bathrooms.
+    6. Size: size of the house in square feet.
+    7. Price/SQ.ft: price of the house per square foot.
+    8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.
+
+    Each field is comma separated.
+
+    Sample output:
+
+    +----------------+-----------------+
+    |        Location| avg(Price SQ Ft)|
+    +----------------+-----------------+
+    |          Oceano|             95.0|
+    |         Bradley|            206.0|
+    | San Luis Obispo|            359.0|
+    |      Santa Ynez|            491.4|
+    |         Cayucos|            887.0|
+    |................|.................|
+    |................|.................|
+    |................|.................|
+    '''
+
diff --git a/sparkSql/HousePriceProblem.scala b/sparkSql/HousePriceProblem.scala
diff --git a/sparkSql/HousePriceSolution.py b/sparkSql/HousePriceSolution.py
@@ -0,0 +1,30 @@
+from pyspark.sql import SparkSession
+
+PRICE_SQ_FT = "Price SQ Ft"
+
+if __name__ == "__main__":
+
+    # Build (or reuse) a session with a "local" master and log at ERROR level only.
+    session = SparkSession.builder.appName("HousePriceSolution").master("local").getOrCreate()
+    session.sparkContext.setLogLevel("ERROR")
+
+    # The first CSV row supplies the column names; column types are inferred from the data.
+    realEstate = session.read \
+        .option("header","true") \
+        .option("inferSchema", value=True) \
+        .csv("in/RealEstate.csv")
+
+    # avg() names its output column "avg(<input column>)". Derive that name from PRICE_SQ_FT
+    # instead of retyping it: the previous literal "avg(Price SQ FT)" differed in case from
+    # the real column and only resolved while Spark's column matching is case-insensitive.
+    avgPriceSqFt = "avg({})".format(PRICE_SQ_FT)
+
+    # Average price per square foot for each location, in ascending order of that average.
+    realEstate.groupBy("Location") \
+        .avg(PRICE_SQ_FT) \
+        .orderBy(avgPriceSqFt) \
+        .show()
+
+    # Stop the session once the result has been printed.
+    session.stop()
diff --git a/sparkSql/HousePriceSolution.scala b/sparkSql/HousePriceSolution.scala
diff --git a/sparkSql/RddDataframeConversion.py b/sparkSql/RddDataframeConversion.py
@@ -0,0 +1,57 @@
+from pyspark.sql import SparkSession
+from commons.Utils import Utils
+
+def getColNames(line: str):
+    """Return the four header names kept for the DataFrame.
+
+    The line is split with Utils.COMMA_DELIMITER and the fields at positions
+    2, 6, 9 and 14 are returned, in that order, as a list.
+    """
+    splits = Utils.COMMA_DELIMITER.split(line)
+    return [splits[2], splits[6], splits[9], splits[14]]
+
+def mapResponseRdd(line: str):
+    """Turn one CSV data line into a (str, float or None, str, float or None) tuple.
+
+    Uses the same field positions as getColNames. Fields 6 and 14 are parsed
+    with float(); an empty field yields None instead.
+    """
+    splits = Utils.COMMA_DELIMITER.split(line)
+    double1 = float(splits[6]) if splits[6] else None
+    double2 = float(splits[14]) if splits[14] else None
+    return splits[2], double1, splits[9], double2
+
+def isHeader(line: str):
+    """Return True when field 2 of the line is the literal column name "country"."""
+    return Utils.COMMA_DELIMITER.split(line)[2] == "country"
+
+if __name__ == "__main__":
+
+    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
+    sc = session.sparkContext
+    sc.setLogLevel("ERROR")
+
+    lines = sc.textFile("in/2016-stack-overflow-survey-responses.csv")
+
+    # Only the header line is needed here, so take the first match rather than
+    # collecting every matching line to the driver and indexing the list.
+    colNames = lines.filter(isHeader).map(getColNames).first()
+
+    # Every line that is not the header is converted into a response tuple.
+    responseRDD = lines \
+        .filter(lambda line: not isHeader(line)) \
+        .map(mapResponseRdd)
+
+    responseDataFrame = responseRDD.toDF(colNames)
+
+    print("=== Print out schema ===")
+    responseDataFrame.printSchema()
+
+    print("=== Print 20 records of responses table ===")
+    responseDataFrame.show(20)
+
+    # Convert back to an RDD; collect() brings every row to the driver before printing.
+    for response in responseDataFrame.rdd.collect():
+        print(response)
+
+    session.stop()
diff --git a/sparkSql/RddDatasetConversion.scala b/sparkSql/RddDatasetConversion.scala
diff --git a/sparkSql/Response.scala b/sparkSql/Response.scala
diff --git a/sparkSql/StackOverFlowSurvey.py b/sparkSql/StackOverFlowSurvey.py
@@ -0,0 +1,56 @@
+from pyspark.sql import SparkSession
+
+AGE_MIDPOINT = "age_midpoint"
+SALARY_MIDPOINT = "salary_midpoint"
+SALARY_MIDPOINT_BUCKET = "salary_midpoint_bucket"
+
+if __name__ == "__main__":
+
+    # Session with a "local" master that logs at ERROR level only.
+    spark = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
+    spark.sparkContext.setLogLevel("ERROR")
+
+    # Load the survey CSV: the first row is the header, column types are inferred.
+    reader = spark.read.option("header", "true").option("inferSchema", value=True)
+    survey = reader.csv("in/2016-stack-overflow-survey-responses.csv")
+
+    print("=== Print out schema ===")
+    survey.printSchema()
+
+    # Everything up to the salary buckets works on these four columns only.
+    selected = survey.select("country", "occupation", AGE_MIDPOINT, SALARY_MIDPOINT)
+
+    print("=== Print the selected columns of the table ===")
+    selected.show()
+
+    print("=== Print records where the response is from Afghanistan ===")
+    fromAfghanistan = selected["country"] == "Afghanistan"
+    selected.filter(fromAfghanistan).show()
+
+    print("=== Print the count of occupations ===")
+    selected.groupBy("occupation").count().show()
+
+    print("=== Print records with average mid age less than 20 ===")
+    underTwenty = selected[AGE_MIDPOINT] < 20
+    selected.filter(underTwenty).show()
+
+    print("=== Print the result by salary middle point in descending order ===")
+    selected.orderBy(selected[SALARY_MIDPOINT], ascending=False).show()
+
+    print("=== Group by country and aggregate by average salary middle point ===")
+    selected.groupBy("country").avg(SALARY_MIDPOINT).show()
+
+    # Bucket column: divide the salary midpoint by 20000, cast the quotient to an
+    # integer, then multiply back by 20000. Added to the full table, not the selection.
+    bucketSize = 20000
+    salaryBucket = (survey[SALARY_MIDPOINT] / bucketSize).cast("integer") * bucketSize
+    bucketed = survey.withColumn(SALARY_MIDPOINT_BUCKET, salaryBucket)
+
+    print("=== With salary bucket column ===")
+    bucketed.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()
+
+    print("=== Group by salary bucket ===")
+    bucketCounts = bucketed.groupBy(SALARY_MIDPOINT_BUCKET).count()
+    bucketCounts.orderBy(SALARY_MIDPOINT_BUCKET).show()
+
+    spark.stop()
diff --git a/sparkSql/StackOverFlowSurvey.scala b/sparkSql/StackOverFlowSurvey.scala
diff --git a/sparkSql/TypedDataset.scala b/sparkSql/TypedDataset.scala