
Commit 9fe453e

Merge pull request jleetutorial#2 from jleetutorial/pedromb-scala_to_python
Scala to Python - sparkSql folder
2 parents 9d9066c + 3aeb5d8

14 files changed: +175 −263 lines

commons/Utils.py

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 import re
 
 class Utils():
-
-    COMMA_DELIMITER = re.compile(''',(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''')
+
+    COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')
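As a quick illustration (a hypothetical snippet, not part of the commit): the new lookahead treats a comma as a delimiter only when an even number of double quotes follows it, i.e. when the comma sits outside a quoted field. Note the rewritten pattern only recognizes double-quoted fields, whereas the deleted version also tolerated single quotes.

    import re

    # The pattern added by this commit
    COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')

    # Commas inside double-quoted fields are not split on
    line = 'id1,"San Luis Obispo, CA",425000'
    print(COMMA_DELIMITER.split(line))
    # ['id1', '"San Luis Obispo, CA"', '425000']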

commons/Utils.scala

Lines changed: 0 additions & 6 deletions
This file was deleted.

sparkSql/HousePriceProblem.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
if __name__ == "__main__":

    '''
    Create a Spark program to read the house data from in/RealEstate.csv,
    group by location, aggregate the average price per SQ Ft and sort by average price per SQ Ft.

    The houses dataset contains a collection of recent real estate listings in San Luis Obispo county
    and around it.

    The dataset contains the following fields:
    1. MLS: Multiple listing service number for the house (unique ID).
    2. Location: city/town where the house is located. Most locations are in San Luis Obispo county and
       northern Santa Barbara county (Santa Maria-Orcutt, Lompoc, Guadalupe, Los Alamos), but there are
       some out-of-area locations as well.
    3. Price: the most recent listing price of the house (in dollars).
    4. Bedrooms: number of bedrooms.
    5. Bathrooms: number of bathrooms.
    6. Size: size of the house in square feet.
    7. Price/SQ.ft: price of the house per square foot.
    8. Status: type of sale. Three types are represented in the dataset: Short Sale, Foreclosure and Regular.

    Each field is comma separated.

    Sample output:

    +----------------+-----------------+
    |        Location| avg(Price SQ Ft)|
    +----------------+-----------------+
    |          Oceano|             95.0|
    |         Bradley|            206.0|
    | San Luis Obispo|            359.0|
    |      Santa Ynez|            491.4|
    |         Cayucos|            887.0|
    |................|.................|
    |................|.................|
    |................|.................|
    '''

sparkSql/HousePriceProblem.scala

Lines changed: 0 additions & 40 deletions
This file was deleted.

sparkSql/HousePriceSolution.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
from pyspark.sql import SparkSession

PRICE_SQ_FT = "Price SQ Ft"

if __name__ == "__main__":

    session = SparkSession.builder.appName("HousePriceSolution").master("local").getOrCreate()
    session.sparkContext.setLogLevel("ERROR")
    realEstate = session.read \
        .option("header", "true") \
        .option("inferSchema", value=True) \
        .csv("in/RealEstate.csv")

    realEstate.groupBy("Location") \
        .avg(PRICE_SQ_FT) \
        .orderBy("avg(Price SQ Ft)") \
        .show()
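The orderBy call relies on the auto-generated column name avg(Price SQ Ft). A minimal alternative sketch (not part of the commit) gives the aggregate an explicit alias, so the sort does not depend on Spark's generated naming:

    import pyspark.sql.functions as F

    # Same aggregation, with an explicit alias for the average column
    realEstate.groupBy("Location") \
        .agg(F.avg(PRICE_SQ_FT).alias("avg_price_sq_ft")) \
        .orderBy("avg_price_sq_ft") \
        .show()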

sparkSql/HousePriceSolution.scala

Lines changed: 0 additions & 25 deletions
This file was deleted.

sparkSql/RddDataframeConversion.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from pyspark.sql import SparkSession
from commons.Utils import Utils

def getColNames(line: str):
    splits = Utils.COMMA_DELIMITER.split(line)
    return [splits[2], splits[6], splits[9], splits[14]]

def mapResponseRdd(line: str):
    splits = Utils.COMMA_DELIMITER.split(line)
    double1 = None if not splits[6] else float(splits[6])
    double2 = None if not splits[14] else float(splits[14])
    return splits[2], double1, splits[9], double2

if __name__ == "__main__":

    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
    sc = session.sparkContext
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

    colNames = lines \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[2] == "country") \
        .map(getColNames)

    responseRDD = lines \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[2] != "country") \
        .map(mapResponseRdd)

    responseDataFrame = responseRDD.toDF(colNames.collect()[0])

    print("=== Print out schema ===")
    responseDataFrame.printSchema()

    print("=== Print 20 records of responses table ===")
    responseDataFrame.show(20)

    for response in responseDataFrame.rdd.collect():
        print(response)
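Here responseRDD.toDF(...) infers the column types from the tuples and takes the column names from the header row returned by colNames.collect()[0]. A hypothetical alternative sketch (not in the commit) declares the schema explicitly; the field names below assume the header columns at indices 2, 6, 9 and 14 are country, age_midpoint, occupation and salary_midpoint, as the companion StackOverFlowSurvey.py suggests:

    from pyspark.sql.types import StructType, StructField, StringType, DoubleType

    # Explicit schema matching the (str, float, str, float) tuples from mapResponseRdd
    schema = StructType([
        StructField("country", StringType(), True),
        StructField("age_midpoint", DoubleType(), True),
        StructField("occupation", StringType(), True),
        StructField("salary_midpoint", DoubleType(), True),
    ])
    responseDataFrame = session.createDataFrame(responseRDD, schema)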

sparkSql/RddDatasetConversion.scala

Lines changed: 0 additions & 42 deletions
This file was deleted.

sparkSql/Response.scala

Lines changed: 0 additions & 3 deletions
This file was deleted.

sparkSql/StackOverFlowSurvey.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
from pyspark.sql import SparkSession

AGE_MIDPOINT = "age_midpoint"
SALARY_MIDPOINT = "salary_midpoint"
SALARY_MIDPOINT_BUCKET = "salary_midpoint_bucket"

if __name__ == "__main__":

    session = SparkSession.builder.appName("StackOverFlowSurvey").master("local").getOrCreate()
    session.sparkContext.setLogLevel("ERROR")
    dataFrameReader = session.read

    responses = dataFrameReader \
        .option("header", "true") \
        .option("inferSchema", value=True) \
        .csv("in/2016-stack-overflow-survey-responses.csv")

    print("=== Print out schema ===")
    responses.printSchema()

    responseWithSelectedColumns = responses.select("country", "occupation", AGE_MIDPOINT, SALARY_MIDPOINT)

    print("=== Print the selected columns of the table ===")
    responseWithSelectedColumns.show()

    print("=== Print records where the response is from Afghanistan ===")
    responseWithSelectedColumns.filter(responseWithSelectedColumns["country"] == "Afghanistan").show()

    print("=== Print the count of occupations ===")
    groupedDataset = responseWithSelectedColumns.groupBy("occupation")
    groupedDataset.count().show()

    print("=== Print records with average mid age less than 20 ===")
    responseWithSelectedColumns.filter(responseWithSelectedColumns[AGE_MIDPOINT] < 20).show()

    print("=== Print the result by salary middle point in descending order ===")
    responseWithSelectedColumns.orderBy(responseWithSelectedColumns[SALARY_MIDPOINT], ascending=False).show()

    print("=== Group by country and aggregate by average salary middle point ===")
    datasetGroupByCountry = responseWithSelectedColumns.groupBy("country")
    datasetGroupByCountry.avg(SALARY_MIDPOINT).show()

    responseWithSalaryBucket = responses.withColumn(SALARY_MIDPOINT_BUCKET,
        (responses[SALARY_MIDPOINT] / 20000).cast("integer") * 20000)

    print("=== With salary bucket column ===")
    responseWithSalaryBucket.select(SALARY_MIDPOINT, SALARY_MIDPOINT_BUCKET).show()

    print("=== Group by salary bucket ===")
    responseWithSalaryBucket.groupBy(SALARY_MIDPOINT_BUCKET).count().orderBy(SALARY_MIDPOINT_BUCKET).show()

    session.stop()
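The salary bucket expression floors each salary midpoint to a multiple of 20,000: divide by the bucket width, truncate to an integer, then multiply back. A tiny standalone illustration of the arithmetic (hypothetical, not part of the commit):

    # cast("integer") truncates toward zero, which floors positive salaries
    salary = 55000
    bucket = int(salary / 20000) * 20000   # 55000 / 20000 = 2.75 -> 2 -> 40000
    print(bucket)  # 40000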

sparkSql/StackOverFlowSurvey.scala

Lines changed: 0 additions & 60 deletions
This file was deleted.

sparkSql/TypedDataset.scala

Lines changed: 0 additions & 55 deletions
This file was deleted.
