diff --git a/Gemfile b/Gemfile index fd51732..f0f42ae 100644 --- a/Gemfile +++ b/Gemfile @@ -1,18 +1,21 @@ source "https://rubygems.org/" -gem "jekyll", "~> 3.8.5" +#gem "jekyll", "~> 4.0" gem 'kramdown' gem 'rouge' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. -gem "github-pages", "~> 201", group: :jekyll_plugins +gem "github-pages", group: :jekyll_plugins # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.11" + gem "jekyll-feed" + gem 'jekyll-paginate' end +gem 'webrick' + # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem # and associated library. install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! } do diff --git a/Gemfile.lock b/Gemfile.lock index 4c59211..9859e75 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,11 +1,12 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.11.1) - i18n (~> 0.7) + activesupport (6.0.6.1) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) + zeitwerk (~> 2.2, >= 2.2.2) addressable (2.8.0) public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) @@ -13,216 +14,239 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.5) - dnsruby (1.61.3) - addressable (~> 2.5) - em-websocket (0.5.1) + commonmarker (0.23.10) + concurrent-ruby (1.2.0) + dnsruby (1.61.9) + simpleidn (~> 0.1) + em-websocket (0.5.3) eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) + http_parser.rb (~> 0) + ethon (0.15.0) + ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.7.0) - faraday (0.17.0) + execjs (2.8.1) + faraday (1.10.0) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0) + faraday-multipart (~> 1.0) + faraday-net_http 
(~> 1.0) + faraday-net_http_persistent (~> 1.0) + faraday-patron (~> 1.0) + faraday-rack (~> 1.0) + faraday-retry (~> 1.0) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-multipart (1.0.3) multipart-post (>= 1.2, < 3) - ffi (1.11.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + faraday-rack (1.0.0) + faraday-retry (1.0.3) + ffi (1.15.5) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (201) - activesupport (= 4.2.11.1) - github-pages-health-check (= 1.16.1) - jekyll (= 3.8.5) - jekyll-avatar (= 0.6.0) + github-pages (226) + github-pages-health-check (= 1.17.9) + jekyll (= 3.9.2) + jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) + jekyll-commonmark-ghpages (= 0.2.0) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.11.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.12.1) - jekyll-mentions (= 1.4.1) - jekyll-optional-front-matter (= 0.3.0) + jekyll-github-metadata (= 2.13.0) + jekyll-include-cache (= 0.2.1) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.2.0) - jekyll-redirect-from (= 0.14.0) - jekyll-relative-links (= 0.6.0) - jekyll-remote-theme (= 0.4.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.3) jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.5.0) - jekyll-sitemap (= 1.2.0) - jekyll-swiss (= 0.4.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) - jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.3) - jekyll-theme-slate (= 0.1.1) - 
jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.1) - jemoji (= 0.10.2) - kramdown (= 1.17.0) - liquid (= 4.0.0) - listen (= 3.1.5) + jekyll-seo-tag (= 2.8.0) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.2.0) + jekyll-theme-cayman (= 0.2.0) + jekyll-theme-dinky (= 0.2.0) + jekyll-theme-hacker (= 0.2.0) + jekyll-theme-leap-day (= 0.2.0) + jekyll-theme-merlot (= 0.2.0) + jekyll-theme-midnight (= 0.2.0) + jekyll-theme-minimal (= 0.2.0) + jekyll-theme-modernist (= 0.2.0) + jekyll-theme-primer (= 0.6.0) + jekyll-theme-slate (= 0.2.0) + jekyll-theme-tactile (= 0.2.0) + jekyll-theme-time-machine (= 0.2.0) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.2) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.5.0) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.11.0) + minima (= 2.5.1) + nokogiri (>= 1.13.4, < 2.0) + rouge (= 3.26.0) terminal-table (~> 1.4) - github-pages-health-check (1.16.1) + github-pages-health-check (1.17.9) addressable (~> 2.3) dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 3.0) + public_suffix (>= 3.0, < 5.0) typhoeus (~> 1.3) - html-pipeline (2.12.0) + html-pipeline (2.14.1) activesupport (>= 2) nokogiri (>= 1.4) - http_parser.rb (0.6.0) + http_parser.rb (0.8.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.5) + jekyll (3.9.2) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.6.0) - jekyll (~> 3.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - 
commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.2.0) + commonmarker (~> 0.23.4) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.11.0) - jekyll (~> 3.3) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.12.1) - jekyll (~> 3.4) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.4.1) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) - jekyll-optional-front-matter (0.3.0) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-readme-index (0.2.0) - jekyll (~> 3.0) - jekyll-redirect-from (0.14.0) - jekyll (~> 3.3) - jekyll-relative-links (0.6.0) - jekyll (~> 3.3) - jekyll-remote-theme (0.4.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.3) addressable (~> 2.0) - jekyll (~> 3.5) - rubyzip (>= 1.2.1, < 3.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) + jekyll-theme-cayman (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 
3.5) + jekyll-theme-dinky (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) + jekyll-theme-leap-day (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) + jekyll-theme-merlot (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) + jekyll-theme-midnight (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) + jekyll-theme-minimal (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) + jekyll-theme-modernist (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.3) - jekyll (~> 3.5) + jekyll-theme-primer (0.6.0) + jekyll (> 3.5, < 5.0) jekyll-github-metadata (~> 2.9) jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) + jekyll-theme-slate (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) + jekyll-theme-tactile (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) + jekyll-theme-time-machine (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.1) - jekyll (~> 3.3) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.10.2) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (~> 3.0) - kramdown (1.17.0) - liquid (4.0.0) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.2) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.1) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) 
mercenary (0.3.6) - mini_portile2 (2.5.1) - minima (2.5.0) - jekyll (~> 3.5) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.12.2) + minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.11.5) - mini_portile2 (~> 2.5.0) + nokogiri (1.16.5-x86_64-linux) racc (~> 1.4) - octokit (4.14.0) + octokit (4.22.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (3.1.1) - racc (1.5.2) - rb-fsevent (0.10.3) - rb-inotify (0.10.0) + public_suffix (4.0.7) + racc (1.7.3) + rb-fsevent (0.11.1) + rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.11.0) - ruby-enum (0.7.2) - i18n - ruby_dep (1.5.0) - rubyzip (2.0.0) + rexml (3.3.9) + rouge (3.26.0) + ruby2_keywords (0.0.5) + rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) @@ -232,30 +256,38 @@ GEM sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) + simpleidn (0.2.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.3.1) + typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.5) + tzinfo (1.2.10) thread_safe (~> 0.1) - tzinfo-data (1.2019.3) + tzinfo-data (1.2022.1) tzinfo (>= 1.0.0) - unicode-display_width (1.6.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.8.1) + unicode-display_width (1.8.0) wdm (0.1.1) + webrick (1.8.2) + zeitwerk (2.6.6) PLATFORMS - ruby + x86_64-linux DEPENDENCIES - github-pages (~> 201) - jekyll (~> 3.8.5) - jekyll-feed (~> 0.11) + github-pages + jekyll-feed + jekyll-paginate kramdown rouge tzinfo (~> 1.2) tzinfo-data wdm (~> 0.1.1) + webrick BUNDLED WITH - 2.0.2 + 2.3.8 diff --git a/_data/authors.yml b/_data/authors.yml index 1ffec72..69d8120 100644 --- a/_data/authors.yml +++ b/_data/authors.yml @@ -3,6 +3,13 @@ # description, etc --- +bshaw: + name: Ben Shaw + github: benshaw + twitter: ben_a_shaw + about: | + Ben leads the ML Platform group, helping scale production Machine Learning at scribd. 
Other times you will find him outside playing in the mountains. + alexjb: name: Alex Bernardin github: alexofmanytrades diff --git a/_data/team-structure.yml b/_data/team-structure.yml index 27688e3..90f26e4 100644 --- a/_data/team-structure.yml +++ b/_data/team-structure.yml @@ -91,3 +91,8 @@ description: | The Web QA team strives for a defect-free Scribd website known for its reliability. + +- team: Service Foundations + description: | + The Service Foundations team provides reliable, high-quality, scalable service foundations + that teams can leverage to easily build, deploy and monitor self-owned, distributed services. diff --git a/_layouts/home.html b/_layouts/home.html index b92aa95..3439c1f 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -21,7 +21,7 @@

Help us build our next project.

-

We're on a mission to change the way the world reads. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

+

We're on a mission to build the largest and most accessible library connecting storytellers with their audience. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

All Positions diff --git a/_posts/2020-04-29-monitoring-aws-with-panther.md b/_posts/2020-04-29-monitoring-aws-with-panther.md index 6866b4d..da5ceec 100644 --- a/_posts/2020-04-29-monitoring-aws-with-panther.md +++ b/_posts/2020-04-29-monitoring-aws-with-panther.md @@ -5,10 +5,13 @@ tags: - monitoring - aws - featured +- archived team: Security Engineering author: paha --- +***NOTE***: *Scribd’s security infrastructure has since evolved away from using Panther* + Before widespread cloud usage, it was uncommon for one person to be present for the entire datacenter development lifecycle. Very few people knew how to design and build a datacenter from scratch while ensuring appropriate security configuration settings were set, on top of rigging up monitoring. It was even more uncommon for non-sysadmins to have any involvement in data center infrastructure construction or ongoing refinement. The cloud is very different. It only takes seconds to create an entire infrastructure from a template. And even developers are doing it! The monitoring challenges for such a scenario are significant. There aren't necessarily "more" monitoring data points, but the speed with which infrastructure can be created tends to result in infrastructure getting way out over its skis with respect to monitoring. Furthermore, since many barriers to entry for doing stupid things have been lowered to the point of non-existence, monitoring is the last great hope of maintaining control over a cloud environment. While access controls can still provide some guardrails, the flexibility that all engineers need to do their jobs requires that they have the ability to do "dangerous" things that they've never had to do before. The true definition of "full stack" has expanded. 
diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md new file mode 100644 index 0000000..8916901 --- /dev/null +++ b/_posts/2022-04-28-data-ai-summit-2022.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Scribd is presenting at Data and AI Summit 2022" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We are very excited to be presenting and attending this year's [Data and AI +Summit](https://databricks.com/dataaisummit/north-america-2022) which will be +hosted virtually and physically in San Francisco from June 27th-30th. +Throughout the course of 2021 we completed a number of really interesting +projects built around [delta-rs](https://github.com/delta-io/delta-rs) and the +Databricks platform which we are thrilled to share with a broader audience. +In addition to the presentations listed below, a number of Scribd engineers who +are responsible for data and ML platform, machine learning systems, and more, +will be in attendance if you want to meet up and learn more about how Scribd +uses data and ML to change the way the world reads! 
+ + +* [Christian Williams](https://github.com/xianwill) will be sharing some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +**[Streaming Data into Delta Lake with Rust and +Kafka](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1834)** +* [QP Hou](https://github.com/houqp), Scribd Emeritus, will be presenting on +his foundational work to ensure correctness within delta-rs during his session: +**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1623)** +* [R Tyler Croy](https://github.com/rtyler) will be co-presenting with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with **[Doubling the size of the data lake without doubling the +cost](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=2366)** + + +There are so many great sessions to watch in person or online during the event, +particularly around [Delta Lake](https://delta.io), which is one of our +favorite technologies and powers our entire data platform. We are also +expecting some great ML related talks as data and ML begin to overlap more and +more. We hope to see you there! + diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md new file mode 100644 index 0000000..1007238 --- /dev/null +++ b/_posts/2022-06-28-databricks-serverless.md @@ -0,0 +1,58 @@ +--- +layout: post +title: "Accelerating Looker with Databricks SQL Serverless" +tags: +- looker +- databricks +- featured +team: Core Platform +author: hamiltonh +--- + +We recently migrated Looker to a Databricks SQL Serverless, improving our +infrastructure cost and reducing the footprint of infrastructure we need to +worry about! 
“Databricks SQL” provides a single load balanced Warehouse +for executing Spark SQL queries across multiple Spark clusters behind the +scenes. “Serverless” is an evolution of that concept: rather than running a SQL +Warehouse in our AWS infrastructure, the entirety of execution happens on the +Databricks side. With a much simpler and faster interface, queries executed in +Looker now return results much faster to our users than ever before! + +When we originally provisioned our “Databricks SQL” warehouses, we worked +together with our colleagues at Databricks to ensure [the terraform provider +for Databricks](https://github.com/databricks/terraform-provider-databricks) is +ready for production usage, which as of today is Generally Available. That +original foundation in Terraform allowed us to more easily adopt SQL Serverless +once it was made available to us. + +```hcl +resource "databricks_sql_warehouse" "warehouse" { + name = "Looker Serverless" + # ... + enable_serverless_compute = true + # ... +} +``` + +The feature was literally brand new so there were a few integration hurdles we +had to work through with our colleagues at Databricks, but we got things up and +running in short order. By adopting SQL Serverless, we could avoid setting up +special networking, IAM roles, and other resources within our own AWS account; +we can instead rely on pre-provisioned compute resources within Databricks' own +infrastructure. No more headache of ensuring all of the required infra is in +place and set up correctly! + +The switch to Serverless reduced our infra configuration and management +footprint, which by itself is an improvement. We also noticed a significant +reduction in cold start times for the SQL Serverless Warehouse compared to the +standard SQL Warehouse. The faster start-up times meant we could configure even +lower auto-terminate times on the warehouse, saving us even more on +unproductive and idle cluster costs. 
+ +On the Looker side there really wasn’t any difference in the connection +configuration other than a URL change. In the end, after some preparation work +a simple 5 minute change in Looker, and a simple 5 minute change in Terraform +switched everything over to Databricks SQL Serverless, and we were ready to +rock! Our BI team is very happy with the performance, especially on cold start +queries. Our CFO is happy about reducing infrastructure costs. And I’m happy +about simpler infrastructure! diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md new file mode 100644 index 0000000..828f149 --- /dev/null +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Data and AI Summit Wrap-up" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We brought a whole team to San Francisco to present and attend this year's Data and +AI Summit, and it was a blast! +I +would consider the event a success both in the attendance to the Scribd hosted +talks and the number of talks which discussed patterns we have adopted in our +own data and ML platform. +The three talks I [wrote about +previously](/blog/2022/data-ai-summit-2022.html) were well received and have +since been posted to YouTube along with _hundreds_ of other talks. 
+ +* [Christian Williams](https://github.com/xianwill) shared some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +[![Streaming Data into Delta Lake with Rust and Kafka](https://img.youtube.com/vi/do4jsxeKfd4/hqdefault.jpg)](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) +* [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on +his foundational work to ensure correctness within delta-rs during his session: +[![Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://img.youtube.com/vi/ABoCnrVWCKY/hqdefault.jpg)](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) +* [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with: +[![Doubling the size of the data lake without doubling the cost](https://img.youtube.com/vi/9QDRD0PzqCE/hqdefault.jpg)](https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) + +Members of the Scribd team participated in a panel to discuss the past, +present, and future of Delta Lake on the expo floor. We also took advantage of +the time to have multiple discussions with our colleagues at Databricks about +their product and engineering roadmap, and where we can work together to +improve the future of Delta Lake, Unity catalog, and more. + +For those working in the data, ML, or infrastructure space, there are a lot of +_great_ talks available online from the event, which I highly recommend +checking out. Data and AI Summit is a great event for leaders in the industry +to get together, so we'll definitely be back next year! 
diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md new file mode 100644 index 0000000..37f22c2 --- /dev/null +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -0,0 +1,133 @@ +--- +layout: post +title: "The Evolution of the Machine Learning Platform" +team: Machine Learning Platform +author: bshaw +tags: +- mlops +- featured +- ml-platform-series +--- + +Machine Learning Platforms (ML Platforms) have the potential to be a key component in achieving production ML at scale without large technical debt, yet ML Platforms are not often understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms in an effort to increase understanding of these platforms and how they can best be applied. + + +Technical Debt and development velocity defined +----------------------------------------------- + +### Development Velocity + +Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment in a production environment. It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing to model training, evaluation, validation, deployment and testing for new models or for re-training, validation and deployment of existing models. + +### Technical Debt + +The term "technical debt" in software engineering was coined by Ward Cunningham. Cunningham used the metaphor of financial debt to describe the trade-off between implementing a quick and dirty solution to meet immediate needs (similar to taking on financial debt for short-term gain) versus taking the time to do it properly with a more sustainable and maintainable solution (akin to avoiding financial debt but requiring more upfront investment). 
Just as financial debt accumulates interest over time, technical debt can accumulate and make future development more difficult and expensive. + +The idea behind technical debt is to highlight the consequences of prioritizing short-term gains over long-term maintainability and the need to address and pay off this "debt" through proper refactoring and improvements. The term has since become widely adopted in the software development community to describe the accrued cost of deferred work on a software project. + +### Technical Debt in Machine Learning + +Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems; in fact, the landmark Google paper suggests that ML systems have the propensity to easily gain this technical debt. + +> Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. Using the software engineering framework of technical debt, we find it is common to incur massive ongoing maintenance costs in real-world ML systems +> +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +> As the machine learning (ML) community continues to accumulate years of experience with live systems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML systems is relatively fast and cheap, but maintaining them over time is difficult and expensive +> +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +Technical debt is important to consider especially when trying to move fast. Moving fast is easy; moving fast without acquiring technical debt is a lot more complicated. 
+ +The Evolution Of ML Platforms +----------------------------- + +### DevOps -- The paradigm shift that led the way + +DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. DevOps has become standard practice in modern software development and the adoption of DevOps has been widespread, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: + +1. **Automation** + +2. **Continuous Testing** + +3. **Continuous Monitoring** + +4. **Collaboration and Communication** + +5. **Version Control** + +6. **Feedback Loops** + + +### Platforms -- Reducing Cognitive Load + +This shift to DevOps and teams owning the entire development lifecycle introduces a new challenge—additional cognitive load. Cognitive load can be defined as + +> The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +The weight of the additional load introduced in DevOps of teams owning the entire software development lifecycle can hinder productivity, prompting organizations to seek solutions. + +Platforms emerged as a strategic solution, delicately abstracting unnecessary details of the development lifecycle. This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow. + +> The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. 
The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +> Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users +> +> [Rowse & Shepherd (2022) Building Infrastructure Platforms](https://martinfowler.com/articles/building-infrastructure-platform.html) + +### ML Ops -- Reducing technical debt of machine learning + +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps. MLOps is a methodology that takes inspiration from and incorporates best practices of DevOps, tailoring them to address the distinctive challenges inherent in machine learning. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code, and serves as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role in controlling technical debt and ensuring the efficiency, reliability, and scalability of the machine learning lifecycle over time. + +Scribd's ML Platform -- MLOps and Platforms in Action +------------------------------------- +At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers. This platform has been built with MLOps in mind which can be seen through its use of common DevOps principles. + +1. 
**Automation:** + * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS based endpoints. + * Automating Model training through the use of Airflow DAGS and allowing these DAGS to trigger the deployment pipelines to deploy a model once re-training has occurred. + +2. **Continuous** **Testing:** + * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing. + * Increased tooling to support model validation testing. + +3. **Monitoring:** + * Monitoring real time inference endpoints + * Monitoring training DAGS + * Monitoring batch jobs + +4. **Collaboration and Communication:** + * Feature Store which provides feature discovery and re-use + * Model Database which provides model collaboration + +5. **Version Control:** + * Applying version control to experiments, machine learning models and features + + +References +---------- + +[Bottcher (2018, March 05). What I Talk About When I Talk About Platforms. https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) + +[D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-François Crespo, Dan Dennison (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +[Fowler (2022, October 20). Conway's Law. https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) + +[Galante, what is platform engineering. https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) + +[Humanitec, State of Platform Engineering Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) + +[Hodgson (2023, July 19). How platform teams get stuff done. 
https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) + +[Murray (2017, April 27). The Art of Platform Thinking. https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) + +[Rouse (2017, March 20). Technical Debt. https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) + +[Rowse & Shepherd (2022). Building Infrastructure Platforms. https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) + +[Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) diff --git a/_posts/2025-01-15-cloud-native-data-ingestion.md b/_posts/2025-01-15-cloud-native-data-ingestion.md new file mode 100644 index 0000000..2df1b88 --- /dev/null +++ b/_posts/2025-01-15-cloud-native-data-ingestion.md @@ -0,0 +1,35 @@ +--- +layout: post +title: "Cloud-native Data Ingestion with AWS Aurora and Delta Lake" +team: "Core Infrastructure" +author: rtyler +tags: +- deltalake +- rust +- featured +--- + + +One of the major themes for Infrastructure Engineering over the past couple +years has been higher reliability and better operational efficiency. In a +recent session with the [Delta Lake](https://delta.io) project I was able to +share the work led by Kuntal Basu and a number of other people to _dramatically_ +improve the efficiency and reliability of our online data ingestion pipeline. + + +> Join Kuntal Basu, Staff Infrastructure Engineer, and R. Tyler Croy, Principal +> Engineer at Scribd, Inc. as they take you behind the scenes of Scribd’s data +> ingestion setup. They’ll break down the architecture, explain the tools, and +> walk you through how they turned off-the-shelf solutions into a robust +> pipeline. + + +## Video + +
+ + +## Presentation + +
+ diff --git a/_posts/2025-03-14-terraform-oxbow-module.md b/_posts/2025-03-14-terraform-oxbow-module.md new file mode 100644 index 0000000..ab48af2 --- /dev/null +++ b/_posts/2025-03-14-terraform-oxbow-module.md @@ -0,0 +1,61 @@ +--- +layout: post +title: "Terraform module to manage Oxbow Lambda and its components" +tags: +- Oxbow +- Terraform +- AWS +- deltalake +- rust +team: Core Infrastructure +author: Oleh Motrunych +--- + + +[Oxbow](https://github.com/buoyant-data/oxbow) is a project to turn an existing storage location which contains [Apache Parquet](https://parquet.apache.org/) files into a [Delta Lake table](https://delta.io/). +It is intended to run either as an AWS Lambda or as a command line application. +We are excited to introduce [terraform-oxbow](https://github.com/scribd/terraform-oxbow), an open-source Terraform module that simplifies the deployment and management of AWS Lambda and its supporting components. Whether you're working with AWS Glue, Kinesis Data Firehose, SQS, or DynamoDB, this module provides a streamlined approach to infrastructure as code (IaC) in AWS. + +### ✨ Why terraform-oxbow? +Managing event-driven architectures in AWS can be complex, requiring careful orchestration of multiple services. Terraform-oxbow abstracts much of this complexity, enabling users to configure key components with simple boolean flags and module parameters. This is an easy and efficient way to have a Delta table created using Apache Parquet files. 
+### 🚀 Features + +With **terraform-oxbow**, you can deploy: + +- AWS Oxbow Lambda with customizable configurations +- Kinesis Data Firehose for real-time data streaming +- SQS and SQS Dead Letter Queues for event-driven messaging +- IAM policies for secure access management +- S3 bucket notifications to trigger Lambda functions +- DynamoDB tables for data storage and locking +- AWS Glue Catalog and Tables for schema management + + +### ⚙️ How It Works + +This module follows a modular approach, allowing users to enable or disable services based on their specific use case. Here are a few examples: + +- To enable AWS Glue Catalog and Tables: ```hcl +enable_aws_glue_catalog_table = true +``` + +- To enable Kinesis Data Firehose delivery stream: ```hcl +enable_kinesis_firehose_delivery_stream = true +``` + +- To enable S3 bucket notifications: ```hcl +enable_bucket_notification = true +``` + +- To enable advanced Oxbow Lambda setup for multi-table filtered optimization: ```hcl +enable_group_events = true +``` + +- AWS S3 bucket notifications have limitations: Due to AWS constraints, an S3 bucket can only have a single notification configuration per account. If you need to trigger multiple Lambda functions from the same S3 bucket, consider using event-driven solutions like SNS or SQS. + + +- IAM Policy Management: The module provides the necessary permissions but follows the principle of least privilege. Ensure your IAM policies align with your security requirements. + + +- Scalability and Optimization: The module allows fine-grained control over Lambda concurrency, event filtering, and data processing configurations to optimize costs and performance. + diff --git a/assets/js/jobs.js b/assets/js/jobs.js index 07d3b18..8172cb9 100644 --- a/assets/js/jobs.js +++ b/assets/js/jobs.js @@ -5,7 +5,7 @@ * * With that disclaimer out of the way... 
* - * This file handles the fetching of jobs from Lever such that they can be + * This file handles the fetching of jobs from Lever^WAshby such that they can be * dynamically inserted into different parts of the tech blog */ @@ -13,7 +13,7 @@ * This API will return an list of departments which must then be filtered * through to find the .postings under each */ -const API_URL = 'https://api.lever.co/v0/postings/scribd?group=department&mode=json' +const API_URL = 'https://api.ashbyhq.com/posting-api/job-board/scribd?includeCompensation=true' /* @@ -37,21 +37,20 @@ function fetchJobs() { return fetch(API_URL) .then(async (response) => { - const departments = await response.json(); + const board = await response.json(); /* * Since this is the tech blog, we're only pulling a couple of * departments */ - departments - .filter(d => ['Engineering', 'Data Science', 'Design', 'Business Analytics', 'Product'].includes(d.title)) - .forEach((department) => { - department.postings.forEach((posting) => { - const team = posting.categories.team; + board.jobs + .filter(j => ['Engineering', 'Product, Design, & Analytics', 'Product'].includes(j.department)) + .filter(j => !j.title.toLowerCase().includes('marketing')) + .forEach((job) => { + const team = job.team; if (!window.jobsCache[team]) { window.jobsCache[team] = []; } - window.jobsCache[team].push(posting); - }); + window.jobsCache[team].push(job); }); window.jobsFetched = true; return window.jobsCache; @@ -98,9 +97,9 @@ function renderJobs(elem, team, randomLimit) { li.innerHTML = `
- ${job.text} + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

`; elem.appendChild(li); diff --git a/careers.html b/careers.html index 1965edf..5a5072a 100644 --- a/careers.html +++ b/careers.html @@ -18,7 +18,7 @@ alt="two people sitting around a sofa reading on a computer and tablet">
-

Help us change the way the world reads.

+

Help us build the largest and most accessible library connecting storytellers with their audience.

Our readers are on a mission to become their best selves, and so are we. We’re not afraid to take risks because we know that — win or lose — we’ll learn from them.

If you’re a talented team player and want to work somewhere where your input matters, we’d love to talk with you.

@@ -159,14 +159,14 @@

${team} li.innerHTML = `
- - ${job.text} + + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

- `; diff --git a/tag/mlops/index.md b/tag/mlops/index.md new file mode 100644 index 0000000..b51bead --- /dev/null +++ b/tag/mlops/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: mlops" +tag: mlops +robots: noindex +---