From f4123cb6bcbd044ab930a24cc3530e9e0a5960e0 Mon Sep 17 00:00:00 2001 From: Jonathan Rioux Date: Wed, 11 Jun 2025 09:46:39 -0700 Subject: [PATCH] Databricks Free available --- DownloadsDatabricksFree.py | 140 +++++++++++++++++++++++++++++++++++++ README.md | 8 ++- 2 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 DownloadsDatabricksFree.py diff --git a/DownloadsDatabricksFree.py b/DownloadsDatabricksFree.py new file mode 100644 index 0000000..957ea6a --- /dev/null +++ b/DownloadsDatabricksFree.py @@ -0,0 +1,140 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Getting ready for the book's examples using Databricks Free. +# MAGIC +# MAGIC If you want to get ready ASAP, you can run this notebook end-to-end. The data will be available in volumes and tables under `dapp`. I recommend you read carefully through the notebook and use the `Catalog` link aside to understand what to change when reading from a Databricks volume/table. +# MAGIC +# MAGIC ## Summary of changes +# MAGIC +# MAGIC You don't have to set the `spark` variable. +# MAGIC +# MAGIC # Chapter 2 and 3 (provided as an example) +# MAGIC +# MAGIC **Listing 2.1:** Not applicable using Databricks Free +# MAGIC +# MAGIC **Listing 2.2:** Not applicable using Databricks Free +# MAGIC +# MAGIC **Listing 2.3:** Will not work using Databricks Free / Spark Serverless. You can safely ignore +# MAGIC +# MAGIC **Listing 2.5:** Use the location of the volume instead of the (local) location provided in the book: `spark.read.text("/Volumes/dapp/gutenberg/data/1342-0.txt")` +# MAGIC +# MAGIC **Listing 2.9:** You can also use `display()` in lieu of `show()`. 
+# MAGIC +# MAGIC **Listings 3.3 and 3.4:** Change the location of the writing to a Databricks Volume `/Volumes/dapp/gutenberg/data/simple_count.csv` + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- Let's create a catalog to store our data +# MAGIC create catalog if not exists `dapp`; +# MAGIC +# MAGIC -- With the catalog created, let's create the schemas +# MAGIC use catalog `dapp`; +# MAGIC +# MAGIC create schema if not exists `broadcast_logs`; +# MAGIC +# MAGIC create schema if not exists `elements`; +# MAGIC +# MAGIC create schema if not exists `gsod_noaa`; +# MAGIC +# MAGIC create schema if not exists `gutenberg`; +# MAGIC +# MAGIC create schema if not exists `list_of_numbers`; +# MAGIC +# MAGIC create schema if not exists `recipes`; +# MAGIC +# MAGIC create schema if not exists `shows`; +# MAGIC +# MAGIC create schema if not exists `window`; + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC create volume if not exists broadcast_logs.data; +# MAGIC create volume if not exists broadcast_logs.ReferenceTables; +# MAGIC create volume if not exists elements.data; +# MAGIC create volume if not exists gsod_noaa.data; +# MAGIC create volume if not exists gutenberg.data; +# MAGIC create volume if not exists list_of_numbers.data; +# MAGIC create volume if not exists recipes.data; +# MAGIC create volume if not exists shows.data; +# MAGIC create volume if not exists window.gsod_light; +# MAGIC create volume if not exists window.gsod; + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/BroadcastLogs_2018_Q3_M8_sample.CSV" --output /Volumes/dapp/broadcast_logs/data/BroadcastLogs_2018_Q3_M8_sample.CSV +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/Call_Signs.csv" --output /Volumes/dapp/broadcast_logs/data/Call_Signs.csv + +# COMMAND ---------- + +# MAGIC %sh +# 
MAGIC +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/BroadcastProducers.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/BroadcastProducers.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_AirLanguage.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_AirLanguage.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_AudienceTargetAge.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_AudienceTargetAge.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_AudienceTargetEthnic.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_AudienceTargetEthnic.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_BroadcastOriginPoint.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_BroadcastOriginPoint.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_Category.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_Category.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_ClosedCaption.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_ClosedCaption.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_Composition.csv" --output 
/Volumes/dapp/broadcast_logs/ReferenceTables/CD_Composition.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_CountryOfOrigin.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_CountryOfOrigin.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_DubDramaCredit.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_DubDramaCredit.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_EthnicProgram.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_EthnicProgram.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_Exhibition.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_Exhibition.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_FilmClassification.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_FilmClassification.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_NetworkAffiliation.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_NetworkAffiliation.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_ProductionSource.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_ProductionSource.csv +# MAGIC curl 
"https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_ProgramClass.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_ProgramClass.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/CD_SpecialAttention.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/CD_SpecialAttention.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/broadcast_logs/ReferenceTables/LogIdentifier.csv" --output /Volumes/dapp/broadcast_logs/ReferenceTables/LogIdentifier.csv + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/elements/Periodic_Table_Of_Elements.csv" --output /Volumes/dapp/elements/data/Periodic_Table_Of_Elements.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2010.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2010.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2011.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2011.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2012.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2012.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2013.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2013.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2014.parquet" --output 
/Volumes/dapp/gsod_noaa/data/gsod2014.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2015.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2015.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2016.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2016.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2017.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2017.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2018.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2018.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2019.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2019.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gsod_noaa/gsod2020.parquet" --output /Volumes/dapp/gsod_noaa/data/gsod2020.parquet + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/11-0.txt" --output /Volumes/dapp/gutenberg/data/11-0.txt +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/1342-0.txt" --output /Volumes/dapp/gutenberg/data/1342-0.txt +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/1661-0.txt" --output /Volumes/dapp/gutenberg/data/1661-0.txt +# MAGIC curl 
"https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/2701-0.txt" --output /Volumes/dapp/gutenberg/data/2701-0.txt +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/30254-0.txt" --output /Volumes/dapp/gutenberg/data/30254-0.txt +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/gutenberg_books/84-0.txt" --output /Volumes/dapp/gutenberg/data/84-0.txt +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/list_of_numbers/sample.csv" --output /Volumes/dapp/list_of_numbers/data/sample.csv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/recipes/epi_r.csv" --output /Volumes/dapp/recipes/data/epi_r.csv + +# COMMAND ---------- + +# MAGIC %sh +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/window/gsod_light.parquet/part-00000-49044fc1-973b-4ca6-a071-621e170220a9-c000.snappy.parquet" --output /Volumes/dapp/window/gsod_light/part-00000-49044fc1-973b-4ca6-a071-621e170220a9-c000.snappy.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/window/gsod.parquet/part-00000-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet" --output /Volumes/dapp/window/gsod/part-00000-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/window/gsod.parquet/part-00001-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet" --output /Volumes/dapp/window/gsod/part-00001-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet +# MAGIC curl 
"https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/window/gsod.parquet/part-00002-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet" --output /Volumes/dapp/window/gsod/part-00002-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/window/gsod.parquet/part-00003-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet" --output /Volumes/dapp/window/gsod/part-00003-6ccefe44-8397-4020-b02f-832dd89c20a6-c000.snappy.parquet +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/shows/pokedex.dsv" --output /Volumes/dapp/shows/data/pokedev.dsv +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/shows/shows-breaking-bad.json" --output /Volumes/dapp/shows/data/shows-breaking-bad.json +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/shows/shows-golden-girls.json" --output /Volumes/dapp/shows/data/shows-golden-girls.json +# MAGIC curl "https://raw.githubusercontent.com/jonesberg/DataAnalysisWithPythonAndPySpark-Data/refs/heads/trunk/shows/shows-silicon-valley.json" --output /Volumes/dapp/shows/data/shows-silicon-valley.json + +# COMMAND ---------- + +spark.read.parquet("/Volumes/dapp/window/gsod_light/*.parquet").write.mode("overwrite").saveAsTable("dapp.window.gsod_light") +spark.read.parquet("/Volumes/dapp/window/gsod/*.parquet").write.mode("overwrite").saveAsTable("dapp.window.gsod") + +# COMMAND ---------- + diff --git a/README.md b/README.md index 7db8bc2..f3fea8e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,13 @@ This is the companion repository for the _Data Analysis with Python and PySpark_ book (Manning, 2022). 
It contains the source code and data download scripts, when pertinent. -## Get the data +## NEW (June 2025): Databricks Free + +With Databricks offering [free access](https://www.databricks.com/blog/introducing-databricks-free-edition) to the most important functionalities, you can now avoid installing (and paying for) your own version. I've created [a notebook/file](DownloadsDatabricksFree.py) you can use to get all the data in tables and volumes. Five minutes and you're ready to work through the code examples, no fuss! + +Just clone the repository in Databricks and open the data download notebook. + +## Get the data (old version, still works) The complete data set for the book hovers at around ~1GB. Because of this, [I moved the data sources to another repository](