From 34e1d5ba5e1a438fcedec029a19bfa273ee9729f Mon Sep 17 00:00:00 2001 From: Ilija Eftimov <854173+fteem@users.noreply.github.com> Date: Thu, 30 Sep 2021 18:11:12 +0200 Subject: [PATCH 01/55] Add SERF to teams list (#113) --- _data/team-structure.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/_data/team-structure.yml b/_data/team-structure.yml index 27688e3..90f26e4 100644 --- a/_data/team-structure.yml +++ b/_data/team-structure.yml @@ -91,3 +91,8 @@ description: | The Web QA team strives for a defect-free Scribd website known for its reliability. + +- team: Service Foundations + description: | + The Service Foundations team provides reliable, high-quality, scalable service foundations + that teams can leverage to easily build, deploy and monitor self-owned, distributed services. From 67c77beb65c27fd9a95037a0ee669777be3b2164 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 30 Sep 2021 09:14:06 -0700 Subject: [PATCH 02/55] Bump nokogiri from 1.11.5 to 1.12.5 (#112) Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.11.5 to 1.12.5. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.11.5...v1.12.5) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 4c59211..e9e3865 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -199,15 +199,15 @@ GEM rb-inotify (~> 0.9, >= 0.9.7) ruby_dep (~> 1.2) mercenary (0.3.6) - mini_portile2 (2.5.1) + mini_portile2 (2.6.1) minima (2.5.0) jekyll (~> 3.5) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) minitest (5.12.2) multipart-post (2.1.1) - nokogiri (1.11.5) - mini_portile2 (~> 2.5.0) + nokogiri (1.12.5) + mini_portile2 (~> 2.6.1) racc (~> 1.4) octokit (4.14.0) sawyer (~> 0.8.0, >= 0.5.3) From 4911a6da8607ddaffbbcea6dd2994174256c2950 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Feb 2022 06:05:00 +0000 Subject: [PATCH 03/55] Bump nokogiri from 1.12.5 to 1.13.3 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.12.5 to 1.13.3. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.12.5...v1.13.3) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e9e3865..6fbfacf 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -199,22 +199,22 @@ GEM rb-inotify (~> 0.9, >= 0.9.7) ruby_dep (~> 1.2) mercenary (0.3.6) - mini_portile2 (2.6.1) + mini_portile2 (2.8.0) minima (2.5.0) jekyll (~> 3.5) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) minitest (5.12.2) multipart-post (2.1.1) - nokogiri (1.12.5) - mini_portile2 (~> 2.6.1) + nokogiri (1.13.3) + mini_portile2 (~> 2.8.0) racc (~> 1.4) octokit (4.14.0) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (3.1.1) - racc (1.5.2) + racc (1.6.0) rb-fsevent (0.10.3) rb-inotify (0.10.0) ffi (~> 1.0) From 262fe7b4bff2c7bc889a029c79e6005ce633a6df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 12 Apr 2022 10:11:30 +0000 Subject: [PATCH 04/55] Bump nokogiri from 1.13.3 to 1.13.4 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.3 to 1.13.4. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/v1.13.4/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.13.3...v1.13.4) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6fbfacf..b3bf972 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -206,7 +206,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.12.2) multipart-post (2.1.1) - nokogiri (1.13.3) + nokogiri (1.13.4) mini_portile2 (~> 2.8.0) racc (~> 1.4) octokit (4.14.0) From 4464c3722d81b3774925d46211dea8b676804226 Mon Sep 17 00:00:00 2001 From: "R. 
Tyler Croy" Date: Thu, 28 Apr 2022 09:43:52 -0700 Subject: [PATCH 05/55] Add blog post about our Data and AI Summit presence --- _posts/2022-04-28-data-ai-summit-2022.md | 45 ++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 _posts/2022-04-28-data-ai-summit-2022.md diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md new file mode 100644 index 0000000..7da9c30 --- /dev/null +++ b/_posts/2022-04-28-data-ai-summit-2022.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "We're presenting at Data and AI Summit 2022" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We are very excited to be presenting and attending this year's [Data and AI +Summit](https://databricks.com/dataaisummit/north-america-2022) which will be +hosted virtually and physically in San Francisco from June 27th-30th. +Throughout the course of 2021 we completed a number of really interesting +projects built around [delta-rs](https://github.com/delta-io/delta-rs) and the +Databricks platform which we are thrilled to share with a broader audience. +In addition to the presentations listed below, a number of Scribd engineers who +are responsible for data and ML platform, machine learning systems, and more, +will be in attendance if you want to meet up and learn more about how Scribd +uses data and ML to change the way the world reads! 
+ + +* [Christian Williams](https://github.com/xianwill) will be sharing some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +**[Streaming Data into Delta Lake with Rust and +Kafka](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1834)** +* [QP Hou](https://github.com/houqp), Scribd Emeritus, will be presenting on +his foundational work to ensure correctness within delta-rs during his session: +**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1623)** +* [R Tyler Croy](https://github.com/rtyler) will be co-presenting with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with **[Doubling the size of the data lake without doubling the +cost](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=2366)** + + +There are so many great sessions to watch in person or online during the event, +particularly around [Delta Lake](https://delta.io), which is one of our +favorite technologies and powers our entire data platform. We are also +expecting some great ML related talks as data and ML begin to overlap more and +more. We hope to see you there! + From c54a65233912f4b626000a170f58c38502820b46 Mon Sep 17 00:00:00 2001 From: "R. 
Tyler Croy" Date: Thu, 28 Apr 2022 09:43:56 -0700 Subject: [PATCH 06/55] Update the Gemfile to run on newer rubies --- Gemfile | 9 +- Gemfile.lock | 330 ++++++++++++++++++++++++++++----------------------- 2 files changed, 187 insertions(+), 152 deletions(-) diff --git a/Gemfile b/Gemfile index fd51732..f0f42ae 100644 --- a/Gemfile +++ b/Gemfile @@ -1,18 +1,21 @@ source "https://rubygems.org/" -gem "jekyll", "~> 3.8.5" +#gem "jekyll", "~> 4.0" gem 'kramdown' gem 'rouge' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. -gem "github-pages", "~> 201", group: :jekyll_plugins +gem "github-pages", group: :jekyll_plugins # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.11" + gem "jekyll-feed" + gem 'jekyll-paginate' end +gem 'webrick' + # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem # and associated library. install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! 
} do diff --git a/Gemfile.lock b/Gemfile.lock index b3bf972..fc02542 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,11 +1,12 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.11.1) - i18n (~> 0.7) + activesupport (6.0.4.8) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) + zeitwerk (~> 2.2, >= 2.2.2) addressable (2.8.0) public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) @@ -13,216 +14,239 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.5) - dnsruby (1.61.3) - addressable (~> 2.5) - em-websocket (0.5.1) + commonmarker (0.23.4) + concurrent-ruby (1.1.10) + dnsruby (1.61.9) + simpleidn (~> 0.1) + em-websocket (0.5.3) eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) + http_parser.rb (~> 0) + ethon (0.15.0) + ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.7.0) - faraday (0.17.0) + execjs (2.8.1) + faraday (1.10.0) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0) + faraday-multipart (~> 1.0) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.0) + faraday-patron (~> 1.0) + faraday-rack (~> 1.0) + faraday-retry (~> 1.0) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-multipart (1.0.3) multipart-post (>= 1.2, < 3) - ffi (1.11.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + faraday-rack (1.0.0) + faraday-retry (1.0.3) + ffi (1.15.5) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (201) - activesupport (= 4.2.11.1) - github-pages-health-check (= 1.16.1) - jekyll (= 3.8.5) - jekyll-avatar (= 0.6.0) + github-pages (226) + github-pages-health-check (= 1.17.9) + jekyll (= 3.9.2) + jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 
1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) + jekyll-commonmark-ghpages (= 0.2.0) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.11.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.12.1) - jekyll-mentions (= 1.4.1) - jekyll-optional-front-matter (= 0.3.0) + jekyll-github-metadata (= 2.13.0) + jekyll-include-cache (= 0.2.1) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.2.0) - jekyll-redirect-from (= 0.14.0) - jekyll-relative-links (= 0.6.0) - jekyll-remote-theme (= 0.4.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.3) jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.5.0) - jekyll-sitemap (= 1.2.0) - jekyll-swiss (= 0.4.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) - jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.3) - jekyll-theme-slate (= 0.1.1) - jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.1) - jemoji (= 0.10.2) - kramdown (= 1.17.0) - liquid (= 4.0.0) - listen (= 3.1.5) + jekyll-seo-tag (= 2.8.0) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.2.0) + jekyll-theme-cayman (= 0.2.0) + jekyll-theme-dinky (= 0.2.0) + jekyll-theme-hacker (= 0.2.0) + jekyll-theme-leap-day (= 0.2.0) + jekyll-theme-merlot (= 0.2.0) + jekyll-theme-midnight (= 0.2.0) + jekyll-theme-minimal (= 0.2.0) + jekyll-theme-modernist (= 0.2.0) + jekyll-theme-primer (= 0.6.0) + jekyll-theme-slate (= 0.2.0) + jekyll-theme-tactile (= 0.2.0) + jekyll-theme-time-machine (= 0.2.0) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.2) + kramdown-parser-gfm (= 
1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.5.0) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.11.0) + minima (= 2.5.1) + nokogiri (>= 1.13.4, < 2.0) + rouge (= 3.26.0) terminal-table (~> 1.4) - github-pages-health-check (1.16.1) + github-pages-health-check (1.17.9) addressable (~> 2.3) dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 3.0) + public_suffix (>= 3.0, < 5.0) typhoeus (~> 1.3) - html-pipeline (2.12.0) + html-pipeline (2.14.1) activesupport (>= 2) nokogiri (>= 1.4) - http_parser.rb (0.6.0) + http_parser.rb (0.8.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.5) + jekyll (3.9.2) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.6.0) - jekyll (~> 3.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.2.0) + commonmarker (~> 0.23.4) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.11.0) - jekyll (~> 3.3) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.12.1) - jekyll (~> 3.4) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.4.1) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) - jekyll-optional-front-matter (0.3.0) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter 
(0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-readme-index (0.2.0) - jekyll (~> 3.0) - jekyll-redirect-from (0.14.0) - jekyll (~> 3.3) - jekyll-relative-links (0.6.0) - jekyll (~> 3.3) - jekyll-remote-theme (0.4.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.3) addressable (~> 2.0) - jekyll (~> 3.5) - rubyzip (>= 1.2.1, < 3.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) + jekyll-theme-cayman (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 3.5) + jekyll-theme-dinky (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) + jekyll-theme-leap-day (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) + jekyll-theme-merlot (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) + jekyll-theme-midnight (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) + jekyll-theme-minimal (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) + jekyll-theme-modernist (0.2.0) + jekyll 
(> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.3) - jekyll (~> 3.5) + jekyll-theme-primer (0.6.0) + jekyll (> 3.5, < 5.0) jekyll-github-metadata (~> 2.9) jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) + jekyll-theme-slate (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) + jekyll-theme-tactile (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) + jekyll-theme-time-machine (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.1) - jekyll (~> 3.3) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.10.2) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (~> 3.0) - kramdown (1.17.0) - liquid (4.0.0) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.2) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.1) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.8.0) - minima (2.5.0) - jekyll (~> 3.5) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.12.2) + minitest (5.15.0) multipart-post (2.1.1) - nokogiri (1.13.4) - mini_portile2 (~> 2.8.0) + nokogiri (1.13.4-x86_64-linux) racc (~> 1.4) - octokit (4.14.0) + octokit (4.22.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (3.1.1) + public_suffix (4.0.7) racc (1.6.0) - rb-fsevent (0.10.3) - rb-inotify (0.10.0) + rb-fsevent (0.11.1) + rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.11.0) - ruby-enum (0.7.2) - i18n - ruby_dep (1.5.0) - rubyzip (2.0.0) + rexml (3.2.5) + rouge (3.26.0) + ruby2_keywords (0.0.5) + rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 
4.0.0) @@ -232,30 +256,38 @@ GEM sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) + simpleidn (0.2.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.3.1) + typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.5) + tzinfo (1.2.9) thread_safe (~> 0.1) - tzinfo-data (1.2019.3) + tzinfo-data (1.2022.1) tzinfo (>= 1.0.0) - unicode-display_width (1.6.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.8.1) + unicode-display_width (1.8.0) wdm (0.1.1) + webrick (1.7.0) + zeitwerk (2.5.4) PLATFORMS - ruby + x86_64-linux DEPENDENCIES - github-pages (~> 201) - jekyll (~> 3.8.5) - jekyll-feed (~> 0.11) + github-pages + jekyll-feed + jekyll-paginate kramdown rouge tzinfo (~> 1.2) tzinfo-data wdm (~> 0.1.1) + webrick BUNDLED WITH - 2.0.2 + 2.3.8 From 2708b23ce6ec922810c1205ef546816e63b4da5e Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Fri, 29 Apr 2022 14:09:57 -0700 Subject: [PATCH 07/55] Update _posts/2022-04-28-data-ai-summit-2022.md Co-authored-by: Jim Park --- _posts/2022-04-28-data-ai-summit-2022.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md index 7da9c30..8916901 100644 --- a/_posts/2022-04-28-data-ai-summit-2022.md +++ b/_posts/2022-04-28-data-ai-summit-2022.md @@ -1,6 +1,6 @@ --- layout: post -title: "We're presenting at Data and AI Summit 2022" +title: "Scribd is presenting at Data and AI Summit 2022" team: Core Platform author: rtyler tags: From 9be7e53be18faf3b0dd868daac46821790bfa0bc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 May 2022 03:48:12 +0000 Subject: [PATCH 08/55] Bump nokogiri from 1.13.4 to 1.13.6 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.4 to 1.13.6. 
- [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.13.4...v1.13.6) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index fc02542..92ad837 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.15.0) multipart-post (2.1.1) - nokogiri (1.13.4-x86_64-linux) + nokogiri (1.13.6-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) From 60eaed031d356087800916feab9d394eb348dd7a Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Tue, 28 Jun 2022 14:23:10 -0700 Subject: [PATCH 09/55] Add Hamilton's post about using Databricks SQL Serverless --- _posts/2022-06-28-databricks-serverless.md | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 _posts/2022-06-28-databricks-serverless.md diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md new file mode 100644 index 0000000..f851133 --- /dev/null +++ b/_posts/2022-06-28-databricks-serverless.md @@ -0,0 +1,58 @@ +---- +layout: post +title: "Accelerating Looker with Databricks SQL Serverless" +tags: +- looker +- databricks +- featured +team: Core Platform +author: hamiltonh +---- + +We recently migrated Looker to a Databricks SQL Serverless, improving our +infrastructure cost and reducing the footprint of infrastructure we need to +worry about! “Databricks SQL” which provides a single load balanced Endpoint +for executing Spark SQL queries across multiple Spark clusters behind the +scenes. “Serverless” is an evolution of that concept, rather than running a SQL +Endpoint in our AWS infrastructure, the entirety of execution happens on the +Databricks side. 
With a much simpler and faster interface, queries executed in +Looker now return results much faster to our users than ever before! + +When we originally provisioned our “Databricks SQL” endpoints, we worked +together with our colleagues at Databricks to ensure [the terraform provider +for Databricks](https://github.com/databricks/terraform-provider-databricks) is +ready for production usage, which as of today is Generally Available. That +original foundation in Terraform allowed us to more easily adopt SQL Serverless +once it was made available to us. + +```hcl +resource "databricks_sql_endpoint" "endpoint" { + name = "Looker Serverless" + # ... + enable_serverless_compute = true + # ... +} +``` + +The feature was literally brand new so there were a few integration hurdles we +had to work through with our colleagues at Databricks, but we got things up and +running in short order. By adopting SQL Serverless, we could avoid setting up +special networking, IAM roles, and other resources within our own AWS account, +we can instead rely on pre-provisioned compute resources within Databricks' own +infrastructure. No more headache of ensuring all of the required infra is in +place and setup correctly! + +The switch to Serverless reduced our infra configuration and management +footprint, which by itself is an improvement. We also noticed a significant +reduction in cold start times for the SQL Serverless Endpoint compared to the +standard SQL Endpoint. The faster start-up times meant we could configure even +lower auto-terminate times on the endpoint, savings us even more on +unproductive and idle cluster costs. + +On the Looker side there really wasn’t any difference in the connection +configuration other than a URL change. In the end, after some preparation work +a simple 5 minute change in Looker, and a simple 5 minute change in Terraform +switched everything over to Databricks SQL Serverless, and we were ready to +rock! 
Our BI team is very happy with the performance, especially on cold start +queries. Our CFO is happy about reducing infrastructure costs. And I’m happy +about simpler infrastructure! From 85421e2b9e0941474afb2769e0dd8f3beb15945b Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Tue, 28 Jun 2022 15:40:53 -0700 Subject: [PATCH 10/55] I think I made a boo boo --- _posts/2022-06-28-databricks-serverless.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md index f851133..c3956a2 100644 --- a/_posts/2022-06-28-databricks-serverless.md +++ b/_posts/2022-06-28-databricks-serverless.md @@ -1,4 +1,4 @@ ----- +--- layout: post title: "Accelerating Looker with Databricks SQL Serverless" tags: @@ -7,7 +7,7 @@ tags: - featured team: Core Platform author: hamiltonh ----- +--- We recently migrated Looker to a Databricks SQL Serverless, improving our infrastructure cost and reducing the footprint of infrastructure we need to From ad1719248a05024fd9820d8c7ba82630829f2bd8 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Thu, 21 Jul 2022 08:41:16 -0700 Subject: [PATCH 11/55] Blog post highlighting the recordings of our Data and AI Summit talks --- _posts/2022-07-21-data-ai-summit-videos.md | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 _posts/2022-07-21-data-ai-summit-videos.md diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md new file mode 100644 index 0000000..9755ba5 --- /dev/null +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -0,0 +1,47 @@ +--- +layout: post +title: "Data and AI Summit Wrap-up" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We brought a whole team to San Francisco to present and attend this year's Data and +AI Summit, and it was a blast! 
+I +would consider the event a success both in the attendance to the Scribd hosted +talks and the number of talks which discussed patterns we have adopted in our +own data and ML platform. +The three talks I [wrote about +previously](/blog/2022/data-ai-summit-2022.html) were well received and have +since been posted to YouTube along with _hundreds_ of other talks. + +* [Christian Williams](https://github.com/xianwill) shared some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +**[Streaming Data into Delta Lake with Rust and +Kafka](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) +* [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on +his foundational work to ensure correctness within delta-rs during his session: +**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) +* [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with **[Doubling the size of the data lake without doubling the +cost]( +https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) + +Members of the Scribd team participated in a panel to discuss the past, +present, and future of Delta Lake on the expo floor. We also took advantage of +the time to have multiple discussions with our colleagues at Databricks about +their product and engineering roadmap, and where we can work together to +improve the future of Delta Lake, Unity catalog, and more. + +For those working in the data, ML, or infrastructure space, there are a lot of +_great_ talks available online from the event, which I highly recommend +checking out. 
Data and AI Summit is a great event for leaders in the industry +to get together, so we'll definitely be back next year! From 24c9a94559a26f4dc246b0fee2708c0ea01908fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 22 Jul 2022 09:31:02 +0000 Subject: [PATCH 12/55] Bump tzinfo from 1.2.9 to 1.2.10 Bumps [tzinfo](https://github.com/tzinfo/tzinfo) from 1.2.9 to 1.2.10. - [Release notes](https://github.com/tzinfo/tzinfo/releases) - [Changelog](https://github.com/tzinfo/tzinfo/blob/master/CHANGES.md) - [Commits](https://github.com/tzinfo/tzinfo/compare/v1.2.9...v1.2.10) --- updated-dependencies: - dependency-name: tzinfo dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 92ad837..d344815 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -263,7 +263,7 @@ GEM thread_safe (0.3.6) typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.9) + tzinfo (1.2.10) thread_safe (~> 0.1) tzinfo-data (1.2022.1) tzinfo (>= 1.0.0) From a966efd7bb0382d2680cd8c2429db72e24d1f1a6 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Fri, 22 Jul 2022 10:08:28 -0700 Subject: [PATCH 13/55] Update _posts/2022-07-21-data-ai-summit-videos.md Co-authored-by: Jim Park --- _posts/2022-07-21-data-ai-summit-videos.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md index 9755ba5..bbceb62 100644 --- a/_posts/2022-07-21-data-ai-summit-videos.md +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -23,8 +23,7 @@ since been posted to YouTube along with _hundreds_ of other talks. 
* [Christian Williams](https://github.com/xianwill) shared some of the work he has done developing [kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: -**[Streaming Data into Delta Lake with Rust and -Kafka](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) +[![Streaming Data into Delta Lake with Rust and Kafka](https://img.youtube.com/vi/do4jsxeKfd4/hqdefault.jpg)](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) * [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on his foundational work to ensure correctness within delta-rs during his session: **[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal From 224c8fac751e5a65ab5100f387ef13e2e63aab88 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Fri, 22 Jul 2022 10:08:32 -0700 Subject: [PATCH 14/55] Update _posts/2022-07-21-data-ai-summit-videos.md Co-authored-by: Jim Park --- _posts/2022-07-21-data-ai-summit-videos.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md index bbceb62..a38b4aa 100644 --- a/_posts/2022-07-21-data-ai-summit-videos.md +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -26,8 +26,8 @@ work he has done developing [![Streaming Data into Delta Lake with Rust and Kafka](https://img.youtube.com/vi/do4jsxeKfd4/hqdefault.jpg)](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) * [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on his foundational work to ensure correctness within delta-rs during his session: -**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal -Verification](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) +[![Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal 
+Verification](https://img.youtube.com/vi/ABoCnrVWCKY/hqdefault.jpg)](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) * [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin Edgley from Databricks on the cost analysis work Scribd has done to efficiently grow our data platform with **[Doubling the size of the data lake without doubling the From b0b4fc8c73d3a1433d63926694f74903239f7b5d Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Fri, 22 Jul 2022 10:08:37 -0700 Subject: [PATCH 15/55] Update _posts/2022-07-21-data-ai-summit-videos.md Co-authored-by: Jim Park --- _posts/2022-07-21-data-ai-summit-videos.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md index a38b4aa..828f149 100644 --- a/_posts/2022-07-21-data-ai-summit-videos.md +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -30,9 +30,8 @@ his foundational work to ensure correctness within delta-rs during his session: Verification](https://img.youtube.com/vi/ABoCnrVWCKY/hqdefault.jpg)](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) * [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin Edgley from Databricks on the cost analysis work Scribd has done to efficiently -grow our data platform with **[Doubling the size of the data lake without doubling the -cost]( -https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) +grow our data platform with: +[![Doubling the size of the data lake without doubling the cost](https://img.youtube.com/vi/9QDRD0PzqCE/hqdefault.jpg)](https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) Members of the Scribd team participated in a panel to discuss the past, present, and future of Delta Lake on the expo floor. 
We also took advantage of From 423741b527b86c631c07a292ac7473bcea00d78a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 22 Sep 2022 02:15:59 +0000 Subject: [PATCH 16/55] Bump commonmarker from 0.23.4 to 0.23.6 Bumps [commonmarker](https://github.com/gjtorikian/commonmarker) from 0.23.4 to 0.23.6. - [Release notes](https://github.com/gjtorikian/commonmarker/releases) - [Changelog](https://github.com/gjtorikian/commonmarker/blob/main/CHANGELOG.md) - [Commits](https://github.com/gjtorikian/commonmarker/compare/v0.23.4...v0.23.6) --- updated-dependencies: - dependency-name: commonmarker dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index d344815..55e5b4d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -14,7 +14,7 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.4) + commonmarker (0.23.6) concurrent-ruby (1.1.10) dnsruby (1.61.9) simpleidn (~> 0.1) From 6dabe56d82ea6004b2511a7379b702da56b2b286 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Oct 2022 04:03:11 +0000 Subject: [PATCH 17/55] Bump nokogiri from 1.13.6 to 1.13.9 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.6 to 1.13.9. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.13.6...v1.13.9) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index d344815..c57d9a2 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.15.0) multipart-post (2.1.1) - nokogiri (1.13.6-x86_64-linux) + nokogiri (1.13.9-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) From 5461549dff9deab5799f4756aa802c4e73722a37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Dec 2022 04:20:26 +0000 Subject: [PATCH 18/55] Bump nokogiri from 1.13.9 to 1.13.10 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.9 to 1.13.10. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.13.9...v1.13.10) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index a780dee..473b588 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.15.0) multipart-post (2.1.1) - nokogiri (1.13.9-x86_64-linux) + nokogiri (1.13.10-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) @@ -239,7 +239,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.6.0) + racc (1.6.1) rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) From 14bd312e5d01c568ac3db9f7884348ccb5a1b6a3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Jan 2023 04:33:46 +0000 Subject: [PATCH 19/55] Bump commonmarker from 0.23.6 to 0.23.7 Bumps [commonmarker](https://github.com/gjtorikian/commonmarker) from 0.23.6 to 0.23.7. 
- [Release notes](https://github.com/gjtorikian/commonmarker/releases) - [Changelog](https://github.com/gjtorikian/commonmarker/blob/main/CHANGELOG.md) - [Commits](https://github.com/gjtorikian/commonmarker/compare/v0.23.6...v0.23.7) --- updated-dependencies: - dependency-name: commonmarker dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 473b588..7a6b0d1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -14,7 +14,7 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.6) + commonmarker (0.23.7) concurrent-ruby (1.1.10) dnsruby (1.61.9) simpleidn (~> 0.1) From ce9f12aa53380ab4db2908ccbda3559d41c3e82c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Jan 2023 03:58:54 +0000 Subject: [PATCH 20/55] Bump activesupport from 6.0.4.8 to 6.0.6.1 Bumps [activesupport](https://github.com/rails/rails) from 6.0.4.8 to 6.0.6.1. - [Release notes](https://github.com/rails/rails/releases) - [Changelog](https://github.com/rails/rails/blob/v7.0.4.2/activesupport/CHANGELOG.md) - [Commits](https://github.com/rails/rails/compare/v6.0.4.8...v6.0.6.1) --- updated-dependencies: - dependency-name: activesupport dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 7a6b0d1..8b46747 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.4.8) + activesupport (6.0.6.1) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) @@ -15,7 +15,7 @@ GEM coffee-script-source (1.11.1) colorator (1.1.0) commonmarker (0.23.7) - concurrent-ruby (1.1.10) + concurrent-ruby (1.2.0) dnsruby (1.61.9) simpleidn (~> 0.1) em-websocket (0.5.3) @@ -229,7 +229,7 @@ GEM jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.15.0) + minitest (5.17.0) multipart-post (2.1.1) nokogiri (1.13.10-x86_64-linux) racc (~> 1.4) @@ -273,7 +273,7 @@ GEM unicode-display_width (1.8.0) wdm (0.1.1) webrick (1.7.0) - zeitwerk (2.5.4) + zeitwerk (2.6.6) PLATFORMS x86_64-linux From 3ae17708a8c52edaa6ade391deb67340c44ec847 Mon Sep 17 00:00:00 2001 From: canefeoglu <592148+canefeoglu@users.noreply.github.com> Date: Mon, 20 Mar 2023 13:36:17 -0700 Subject: [PATCH 21/55] update databricks sql warehouse naming endpoint => warehouse --- _posts/2022-06-28-databricks-serverless.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md index c3956a2..1007238 100644 --- a/_posts/2022-06-28-databricks-serverless.md +++ b/_posts/2022-06-28-databricks-serverless.md @@ -11,14 +11,14 @@ author: hamiltonh We recently migrated Looker to a Databricks SQL Serverless, improving our infrastructure cost and reducing the footprint of infrastructure we need to -worry about! “Databricks SQL” which provides a single load balanced Endpoint +worry about! “Databricks SQL” which provides a single load balanced Warehouse for executing Spark SQL queries across multiple Spark clusters behind the scenes. 
“Serverless” is an evolution of that concept, rather than running a SQL -Endpoint in our AWS infrastructure, the entirety of execution happens on the +Warehouse in our AWS infrastructure, the entirety of execution happens on the Databricks side. With a much simpler and faster interface, queries executed in Looker now return results much faster to our users than ever before! -When we originally provisioned our “Databricks SQL” endpoints, we worked +When we originally provisioned our “Databricks SQL” warehouses, we worked together with our colleagues at Databricks to ensure [the terraform provider for Databricks](https://github.com/databricks/terraform-provider-databricks) is ready for production usage, which as of today is Generally Available. That @@ -26,7 +26,7 @@ original foundation in Terraform allowed us to more easily adopt SQL Serverless once it was made available to us. ```hcl -resource "databricks_sql_endpoint" "endpoint" { +resource "databricks_sql_warehouse" "warehouse" { name = "Looker Serverless" # ... enable_serverless_compute = true @@ -44,9 +44,9 @@ place and setup correctly! The switch to Serverless reduced our infra configuration and management footprint, which by itself is an improvement. We also noticed a significant -reduction in cold start times for the SQL Serverless Endpoint compared to the -standard SQL Endpoint. The faster start-up times meant we could configure even -lower auto-terminate times on the endpoint, savings us even more on +reduction in cold start times for the SQL Serverless Warehouse compared to the +standard SQL Warehouse. The faster start-up times meant we could configure even +lower auto-terminate times on the warehouse, savings us even more on unproductive and idle cluster costs. 
On the Looker side there really wasn’t any difference in the connection From c57f94025f08dfda77a4a54f2e5eaf126eb3499f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Apr 2023 05:30:51 +0000 Subject: [PATCH 22/55] Bump commonmarker from 0.23.7 to 0.23.9 Bumps [commonmarker](https://github.com/gjtorikian/commonmarker) from 0.23.7 to 0.23.9. - [Release notes](https://github.com/gjtorikian/commonmarker/releases) - [Changelog](https://github.com/gjtorikian/commonmarker/blob/main/CHANGELOG.md) - [Commits](https://github.com/gjtorikian/commonmarker/compare/v0.23.7...v0.23.9) --- updated-dependencies: - dependency-name: commonmarker dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 8b46747..ba1e91b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -14,7 +14,7 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.7) + commonmarker (0.23.9) concurrent-ruby (1.2.0) dnsruby (1.61.9) simpleidn (~> 0.1) From d6a22c05adfa8f6e7faf087de8726a089cd6a0d6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Apr 2023 05:55:25 +0000 Subject: [PATCH 23/55] Bump nokogiri from 1.13.10 to 1.14.3 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.10 to 1.14.3. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.13.10...v1.14.3) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 8b46747..e41bb8f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.13.10-x86_64-linux) + nokogiri (1.14.3-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) @@ -239,7 +239,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.6.1) + racc (1.6.2) rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) From d3b168c67c03b9275ea8a342cb11df996b4e1bd6 Mon Sep 17 00:00:00 2001 From: John Date: Tue, 23 May 2023 10:27:03 -0700 Subject: [PATCH 24/55] Update our mission in home.html We have a new mission. Let's make sure that is reflected on our blog. --- _layouts/home.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_layouts/home.html b/_layouts/home.html index b92aa95..3439c1f 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -21,7 +21,7 @@

Help us build our next project.

-

We're on a mission to change the way the world reads. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

+

We're on a mission to build the largest and most accessible library connecting storytellers with their audience. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

All Positions From ab73c79cdaae6f4829fc422f973081bc822be5b6 Mon Sep 17 00:00:00 2001 From: John Date: Tue, 23 May 2023 10:29:17 -0700 Subject: [PATCH 25/55] Updated mission statement on careers.html --- careers.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/careers.html b/careers.html index 1965edf..44d5367 100644 --- a/careers.html +++ b/careers.html @@ -18,7 +18,7 @@ alt="two people sitting around a sofa reading on a computer and tablet">
-

Help us change the way the world reads.

+

Help us build the largest and most accessible library connecting storytellers with their audience.

Our readers are on a mission to become their best selves, and so are we. We’re not afraid to take risks because we know that — win or lose — we’ll learn from them.

If you’re a talented team player and want to work somewhere where your input matters, we’d love to talk with you.

From 3de6a8cfbfb49d8326f119dbe4619d03a498b3c2 Mon Sep 17 00:00:00 2001 From: Marquos Zaki Date: Thu, 22 Jun 2023 00:00:18 +0300 Subject: [PATCH 26/55] mend SEC-1195 Updated panther post since we no longer use it --- _posts/2020-04-29-monitoring-aws-with-panther.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/_posts/2020-04-29-monitoring-aws-with-panther.md b/_posts/2020-04-29-monitoring-aws-with-panther.md index 6866b4d..da5ceec 100644 --- a/_posts/2020-04-29-monitoring-aws-with-panther.md +++ b/_posts/2020-04-29-monitoring-aws-with-panther.md @@ -5,10 +5,13 @@ tags: - monitoring - aws - featured +- archived team: Security Engineering author: paha --- +***NOTE***: *Scribd’s security infrastructure has since evolved away from using Panther* + Before widespread cloud usage, it was uncommon for one person to be present for the entire datacenter development lifecycle. Very few people knew how to design and build a datacenter from scratch while ensuring appropriate security configuration settings were set, on top of rigging up monitoring. It was even more uncommon for non-sysadmins to have any involvement in data center infrastructure construction or ongoing refinement. The cloud is very different. It only takes seconds to create an entire infrastructure from a template. And even developers are doing it! The monitoring challenges for such a scenario are significant. There aren't necessarily "more" monitoring data points, but the speed with which infrastructure can be created tends to result in infrastructure getting way out over its skis with respect to monitoring. Furthermore, since many barriers to entry for doing stupid things have been lowered to the point of non-existence, monitoring is the last great hope of maintaining control over a cloud environment. 
While access controls can still provide some guardrails, the flexibility that all engineers need to do their jobs requires that they have the ability to do "dangerous" things that they've never had to do before. The true definition of "full stack" has expanded. From 61ef0ae8f8bde8b21e54d840b12d59217e112432 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 20:25:34 +0000 Subject: [PATCH 27/55] Bump commonmarker from 0.23.9 to 0.23.10 Bumps [commonmarker](https://github.com/gjtorikian/commonmarker) from 0.23.9 to 0.23.10. - [Release notes](https://github.com/gjtorikian/commonmarker/releases) - [Changelog](https://github.com/gjtorikian/commonmarker/blob/v0.23.10/CHANGELOG.md) - [Commits](https://github.com/gjtorikian/commonmarker/compare/v0.23.9...v0.23.10) --- updated-dependencies: - dependency-name: commonmarker dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 1edeb31..1a9ad5f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -14,7 +14,7 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.23.9) + commonmarker (0.23.10) concurrent-ruby (1.2.0) dnsruby (1.61.9) simpleidn (~> 0.1) From 7551e7311eae90b0d93f07b68f6861bb53b14188 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 1 Feb 2024 10:49:24 -0800 Subject: [PATCH 28/55] added evolution of ml platform --- _posts/2024-02-01-evolution-of-mlplatform.md | 178 +++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 _posts/2024-02-01-evolution-of-mlplatform.md diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md new file mode 100644 index 0000000..fa11b2d --- /dev/null +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -0,0 +1,178 @@ +--- +layout: post title: "The Evolution of the Machine Learning Platform" +team: Machine Learning 
Platform +author: bshaw +tags: +- ml +- mlops +- devops +- platform +--- + +Technical Debt is not unique to Software Engineering and is a concept applicable to production Machine Learning (ML) at scale. Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms and how ML Platforms can act as a key to unlocking Development Velocity without Technical debt. + +* 1 [Technical Debt and development velocity defined](#Technical-Debt-and-development-velocity-defined) + * 1.1 [Development Velocity](#Development-Velocity) + * 1.2 [Technical Debt](#Technical-Debt) + * 1.3 [Technical Debt in Machine Learning](#Technical-Debt-in-Machine-Learning) +* 2 [The Evolution Of ML Platforms](#The-Evolution-Of-ML-Platforms) + * 2.1 [DevOps -- The paradigm shift that led the way](#DevOps----The-paradigm-shift-that-led-the-way) + * 2.2 [Platforms -- Reducing Cognitive Load](#Platforms----Reducing-Cognitive-Load) + * 2.3 [ML Ops -- Reducing technical debt of machine learning](#ML-Ops----Reducing-technical-debt-of-machine-learning) +* 3 [The Rise of Machine Learning Platform](#The-Rise-of-Machine-Learning-Platform) + * 3.1 [Benefits to the Organization](#Benefits-to-the-Organization) +* 4 [References](#References) + +Technical Debt and development velocity defined +----------------------------------------------- + +### Development Velocity + +Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment and maintenance. It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing to model training, evaluation, deployment, and ongoing optimization. 
In platform engineering this is often referred to as rate of change. + +### Technical Debt + +The term "technical debt" in software engineering was coined by Ward Cunningham, who used the metaphor of financial debt to describe the trade-off between implementing a quick and dirty solution to meet immediate needs (similar to taking on financial debt for short-term gain) versus taking the time to do it properly with a more sustainable and maintainable solution (akin to avoiding financial debt but requiring more upfront investment). Just as financial debt accumulates interest over time, technical debt can accumulate and make future development more difficult and expensive. + +The idea behind technical debt is to highlight the consequences of prioritizing short-term gains over long-term maintainability and the need to address and pay off this "debt" through proper refactoring and improvements. The term has since become widely adopted in the software development community to describe the accrued cost of deferred work on a software project.
+ +### Technical Debt in Machine Learning + +Originally a software engineering concept, technical debt is also relevant to Machine Learning systems; in fact, the landmark Google paper [Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) suggests that ML systems have the propensity to easily gain this technical debt.
+ +> Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. Using the software engineering framework of technical debt, we find it is common to incur massive ongoing maintenance costs in real-world ML systems +> +> [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +> As the machine learning (ML) community continues to accumulate years of experience with live systems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML systems is relatively fast and cheap, but maintaining them over time is difficult and expensive +> +> [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +Technical debt is important to consider especially when trying to move fast. Moving fast is easy; moving fast without acquiring technical debt is a lot more complicated. + +The Evolution Of ML Platforms +----------------------------- + +### DevOps -- The paradigm shift that led the way + +DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. DevOps has become standard practice in modern software development.
The adoption of DevOps has been widespread across various industries, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: + +1. **Automation** + +2. **Continuous Testing** + +3. **Continuous Monitoring** + +4. **Collaboration and Communication** + +5. **Version Control** + +6. **Feedback Loops** + + +### Platforms -- Reducing Cognitive Load + +This shift to DevOps and teams owning the entire development lifecycle introduces a new challenge—additional cognitive load. Cognitive load can be defined as: + +> The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks. +> +> — [https://teamtopologies.com/book](https://teamtopologies.com/book) + +As teams adopting DevOps grapple with the mental effort of understanding, operating, and maintaining systems, cognitive load becomes a barrier to efficiency. The weight of this additional load can hinder productivity, prompting organizations to seek solutions. + +Platforms emerged as a strategic solution, delicately abstracting unnecessary details of the development lifecycle. This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow. + +> The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services.
+> +> — [https://teamtopologies.com/book](https://teamtopologies.com/book) + +> _Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users_ +> +> \- [https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) + +### ML Ops -- Reducing technical debt of machine learning + +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps, a methodology that takes inspiration from and incorporates best practices of DevOps, tailoring them to address the distinctive challenges and workflows inherent in machine learning and controlling technical debt. MLOps seamlessly applies the established principles of DevOps to the intricate landscape of machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. It serves as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. + +Some examples of concepts of DevOps applied to ML (aka ML Ops) are: + +1. **Automation:** + + 1. Automation can be applied to many parts of the machine learning lifecycle. The incorporation of automation not only streamlines processes but also addresses technical debt through the establishment of consistency and a standardized and reproducible approach. + + 2. Model deployments can be automated by the implementation of DevOps CI/CD strategies. + + 3. Automation can also be applied to retraining of machine learning models. + +2.
**Continuous** **Testing:** + + * Continuous testing can be applied as part of a model deployment pipeline, removing the need for manual testing (increasing development velocity) and removing technical debt by ensuring tests are performed consistently. + + * Model validation can be automated using tooling, providing consistency between training iterations. + +3. **Monitoring:** + + * Monitoring provides key insights and a step towards creating vital feedback loops. + + * Monitoring can be applied to real-time inference infrastructure, revealing performance concerns similar to DevOps. + + * Monitoring can be applied to model performance to watch for model drift in real time, providing real-time insight into and analysis of model performance and when a model may need to be retrained. + +4. **Collaboration and Communication:** + + * Utilize collaboration tools for effective communication and information sharing among team members. + + * Feature Store provides a platform for discovering, reusing and collaborating on ML features. + + * Model Database provides a platform for discovering, reusing and collaborating on ML Models. + +5. **Version Control:** + + * Applying version control to experiments, machine learning models and features provides reproducibility and traceability. + + +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. MLOps plays a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. + +The Rise of Machine Learning Platform +------------------------------------- + +The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle.
These platforms address the challenges of cognitive load, technical debt, quality and developer velocity and increase efficiency, collaboration, and sustainability. As the ML team grows, the benefits amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality. + +### Benefits to the Organization + +The adoption of a Machine Learning Platform unfolds a spectrum of benefits: + +**Increasing Flow of Change (aka developer velocity):** A swift pace in model development and deployment, enhancing overall efficiency. + +**Fostering Collaboration Amongst Teams:** Breaking down silos and promoting cross-functional collaboration. The platform becomes the silent foundation for collaboration, facilitating a harmonious working environment. + +**Enforcing Best Practices:** Standardizing and ensuring adherence to best practices across ML projects. + +**Reducing/Limiting Technical Debt:** Strategically mitigating the risk of accumulating technical debt, ensuring long-term sustainability. + +**Multiplier Effect:** As the ML team grows, these benefits of the platform amplify—a dividend that multiplies with organizational growth. 
+ +References +---------- + +[https://www.youtube.com/watch?v=Bfhl8kcSaEI&embeds\_referring\_euri=https%3A%2F%2Fplatformengineering.org%2F&feature=emb\_imp\_woyt](https://www.youtube.com/watch?v=Bfhl8kcSaEI&embeds_referring_euri=https%3A%2F%2Fplatformengineering.org%2F&feature=emb_imp_woyt) + +[https://www.atlassian.com/devops/frameworks/team-topologies](https://www.atlassian.com/devops/frameworks/team-topologies) + +[https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) + +[https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) + +[https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) + +[https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) + +[https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +[https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) + +[https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) + +[https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) + +[https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) From c52ab4558dbfe25fff7c36db28d3c9b2ce95f563 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 1 Feb 2024 10:52:30 -0800 Subject: [PATCH 29/55] remove title of contents --- _posts/2024-02-01-evolution-of-mlplatform.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git 
a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index fa11b2d..3d9e94e 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -11,17 +11,6 @@ tags: Technical Debt is not unique to Software Engineering and is a concept applicable to production Machine Learning (ML) at scale. Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms and how ML Platforms can act as a key to unlocking Development Velocity without Technical debt. -* 1 [Technical Debt and development velocity defined](#Technical-Debt-and-development-velocity-defined) - * 1.1 [Development Velocity](#Development-Velocity) - * 1.2 [Technical Debt](#Technical-Debt) - * 1.3 [Technical Debt in Machine Learning](#Technical-Debt-in-Machine-Learning) -* 2 [The Evolution Of ML Platforms](#The-Evolution-Of-ML-Platforms) - * 2.1 [DevOps -- The paradigm shift that led the way](#DevOps----The-paradigm-shift-that-led-the-way) - * 2.2 [Platforms -- Reducing Cognitive Load](#Platforms----Reducing-Cognitive-Load) - * 2.3 [ML Ops -- Reducing technical debt of machine learning](#ML-Ops----Reducing-technical-debt-of-machine-learning) -* 3 [The Rise of Machine Learning Platform](#The-Rise-of-Machine-Learning-Platform) - * 3.1 [Benefits to the Organization](#Benefits-to-the-Organization) -* 4 [References](#References) Technical Debt and development velocity defined ----------------------------------------------- From 6312feceb85d18fdf8dc3601fd2092f5cc2df9a0 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 1 Feb 2024 10:57:16 -0800 Subject: [PATCH 30/55] remove formatting paste --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 3d9e94e..0311732 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -27,7 +27,7 @@ The idea behind technical debt is to highlight the consequences of prioritizing ### Technical Debt in Machine Learning -Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper .css-118vsk3{line-height:22px;padding:var(--ds-space-025,2px) 0px;display:inline;-webkit-box-decoration-break:clone;box-decoration-break:clone;border-radius:var(--ds-border-radius-100,4px);color:var(--ds-link,#0052CC);background-color:var(--ds-surface-raised,white);-webkit-user-select:text;-moz-user-select:text;-ms-user-select:text;user-select:text;border:1px solid var(--ds-border,#DFE1E6);-webkit-transition:0.1s all ease-in-out;transition:0.1s all ease-in-out;-moz-user-select:none;}.css-118vsk3:hover{border-color:var(--ds-border-accent-blue,#2684FF);}.css-118vsk3,.css-118vsk3:hover,.css-118vsk3:focus,.css-118vsk3:active{-webkit-text-decoration:none;text-decoration:none;}.css-118vsk3:active{background-color:var(--ds-background-selected,#DEEBFF);}.css-118vsk3:focus{cursor:pointer;box-shadow:0 0 0 2px var(--ds-border-selected,#4C9AFF);outline:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;}.css-118vsk3:focus,.css-118vsk3:focus:hover,.css-118vsk3:focus:focus,.css-118vsk3:focus:active{-webkit-text-decoration:none;text-decoration:none;}.css-118vsk3:focus:hover{border:1px solid var(--ds-border,#DFE1E6);}.css-1cwva94{white-space:pre-wrap;word-break:break-all;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding:var(--ds-space-025,2px) 
var(--ds-space-050,4px);}.css-10y2gog{color:var(--ds-link,#0052CC);}.css-10y2gog:hover{-webkit-text-decoration:none;text-decoration:none;}[.css-1lcr4h8{margin-right:var(--ds-space-050,4px);position:relative;display:inline-block;}.css-5j6uzt{white-space:pre-wrap;word-break:break-all;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding:var(--ds-space-025,2px) var(--ds-space-050,4px);vertical-align:text-bottom;padding:0px;}https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) suggest that ML systems have the propensity to easily gain this technical debt. +Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark[https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](google paper)suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems > From 6875996ff4a635f629fc052720526c457ef96226 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 1 Feb 2024 10:57:59 -0800 Subject: [PATCH 31/55] fix formatting --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 0311732..aab2420 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -27,7 +27,7 @@ The idea behind technical debt is to highlight the consequences of prioritizing ### Technical Debt in Machine Learning -Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark[https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](google paper)suggest that ML systems have the propensity to easily gain this technical debt. +Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](google paper)suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems > From 91a5328a67e4ec90e1cfff57baaf9b524c2ede11 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Thu, 1 Feb 2024 11:00:24 -0800 Subject: [PATCH 32/55] Update 2024-02-01-evolution-of-mlplatform.md fixing formatting --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index aab2420..51e7438 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -27,7 +27,7 @@ The idea behind technical debt is to highlight the consequences of prioritizing ### Technical Debt in Machine Learning -Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](google paper)suggest that ML systems have the propensity to easily gain this technical debt. +Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems > From fa37acd9e3c0ff3d6759eee3a42257774cdb6e4f Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 10:06:29 -0800 Subject: [PATCH 33/55] Update 2024-02-01-evolution-of-mlplatform.md --- _posts/2024-02-01-evolution-of-mlplatform.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 51e7438..94847c3 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -1,5 +1,6 @@ --- -layout: post title: "The Evolution of the Machine Learning Platform" +layout: +post title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform author: bshaw tags: From 264713b08923ba2861c2961e88698e3fc8ed1549 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 10:07:04 -0800 Subject: [PATCH 34/55] Update 2024-02-01-evolution-of-mlplatform.md --- _posts/2024-02-01-evolution-of-mlplatform.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 94847c3..6c65504 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -1,6 +1,6 @@ --- -layout: -post title: "The Evolution of the Machine Learning Platform" +layout:post +title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform author: bshaw tags: From 9937bf093749a914ec733b59dec51f4995f0fe1c Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 10:07:54 -0800 Subject: [PATCH 35/55] Update 2024-02-01-evolution-of-mlplatform.md --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md 
b/_posts/2024-02-01-evolution-of-mlplatform.md index 6c65504..42eac4e 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -1,5 +1,5 @@ --- -layout:post +layout: post title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform author: bshaw From 21f6da921848ca8dd53b63fd287c19b7a4eb62d7 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 15:40:49 -0800 Subject: [PATCH 36/55] Update 2024-02-01-evolution-of-mlplatform.md --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 42eac4e..054145e 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -2,7 +2,7 @@ layout: post title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform -author: bshaw +author: benshaw tags: - ml - mlops From 62c25c6a5e962853264f811957a35e921f66e5b0 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 15:45:23 -0800 Subject: [PATCH 37/55] Update authors.yml --- _data/authors.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/_data/authors.yml b/_data/authors.yml index 1ffec72..69d8120 100644 --- a/_data/authors.yml +++ b/_data/authors.yml @@ -3,6 +3,13 @@ # description, etc --- +bshaw: + name: Ben Shaw + github: benshaw + twitter: ben_a_shaw + about: | + Ben leads the ML Platform group, helping scale production Machine Learning at scribd. Other times you will find him outside playing in the mountains. 
+ alexjb: name: Alex Bernardin github: alexofmanytrades From d7b3e02599b6bef22ecc225798f8b88c6b2d06e5 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 2 Feb 2024 15:47:03 -0800 Subject: [PATCH 38/55] Update 2024-02-01-evolution-of-mlplatform.md --- _posts/2024-02-01-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 054145e..42eac4e 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -2,7 +2,7 @@ layout: post title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform -author: benshaw +author: bshaw tags: - ml - mlops From 817346f143b9481667f48f29073090b172c542f5 Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 2 Feb 2024 17:44:29 -0800 Subject: [PATCH 39/55] fixed tags --- _posts/2024-02-01-evolution-of-mlplatform.md | 5 ++--- tag/mlops/index.md | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 tag/mlops/index.md diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 42eac4e..6829bf0 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -4,10 +4,9 @@ title: "The Evolution of the Machine Learning Platform" team: Machine Learning Platform author: bshaw tags: -- ml - mlops -- devops -- platform +- featured +- ml-platform-series --- Technical Debt is not unique to Software Engineering and is a concept applicable to production Machine Learning (ML) at scale. Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. 
This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms and how ML Platforms can act as a key to unlocking Development Velocity without Technical debt. diff --git a/tag/mlops/index.md b/tag/mlops/index.md new file mode 100644 index 0000000..b51bead --- /dev/null +++ b/tag/mlops/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: mlops" +tag: mlops +robots: noindex +--- From 9ea9a10f9d04a138f7312f8c215d8c59224328ac Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 2 Feb 2024 17:55:19 -0800 Subject: [PATCH 40/55] always fixing --- _posts/2024-02-01-evolution-of-mlplatform.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-01-evolution-of-mlplatform.md index 6829bf0..b17f0ca 100644 --- a/_posts/2024-02-01-evolution-of-mlplatform.md +++ b/_posts/2024-02-01-evolution-of-mlplatform.md @@ -9,7 +9,7 @@ tags: - ml-platform-series --- -Technical Debt is not unique to Software Engineering and is a concept applicable to production Machine Learning (ML) at scale. Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms and how ML Platforms can act as a key to unlocking Development Velocity without Technical debt. +Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms in an effort to increase an understanding of these and how they can best be applied to bring value. 
Technical Debt and development velocity defined @@ -81,7 +81,7 @@ Platforms emerged as a strategic solution, delicately abstracting unnecessary de ### ML Ops -- Reducing technical debt of machine learning -The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps, a methodology that takes inspiration from and incorporates best practices of the DevOps , tailoring them to address the distinctive challenges and workflows inherent in machine learning and controlling technical debt. MLOps seamlessly applies the established principles of DevOps to the intricate landscape of machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps, a methodology that takes inspiration from and incorporates best practices of the DevOps , tailoring them to address the distinctive challenges and workflows inherent in machine learning in an effort to control technical debt. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. Some examples of concepts of DevOps applied to ML (aka ML Ops) are: @@ -111,16 +111,16 @@ Some examples of concepts of DevOps applied to ML (aka ML Ops) are: * Utilize collaboration tools for effective communication and information sharing among team members. 
- * Feature Store provides a platform for discovering, re using and collaborating on ML features + * Feature Store's provides a platform for discovering, re using and collaborating on ML features - * Model Database provides platform for discovering, re using and collaborating on ML Models + * Model Database's provide a platform for discovering, re using and collaborating on ML Models 5. **Version Control:** - * Applying version control to experiments, machine learning models and features provides + * Applying version control to experiments, machine learning models and features provides better change management and auditing of these ML artifacts -MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. MLOps plays a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. 
The Rise of Machine Learning Platform ------------------------------------- From e628fcaf7a38045bfbd0b8a78c2339d30a4b7d84 Mon Sep 17 00:00:00 2001 From: ben Date: Fri, 2 Feb 2024 17:58:24 -0800 Subject: [PATCH 41/55] update date till monday will release then --- ...ion-of-mlplatform.md => 2024-02-05-evolution-of-mlplatform.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename _posts/{2024-02-01-evolution-of-mlplatform.md => 2024-02-05-evolution-of-mlplatform.md} (100%) diff --git a/_posts/2024-02-01-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md similarity index 100% rename from _posts/2024-02-01-evolution-of-mlplatform.md rename to _posts/2024-02-05-evolution-of-mlplatform.md From 3fb64427ffcf3e4288a9c33019187c621eb4e655 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Feb 2024 03:31:41 +0000 Subject: [PATCH 42/55] Bump nokogiri from 1.14.3 to 1.16.2 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.14.3 to 1.16.2. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.14.3...v1.16.2) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 1a9ad5f..6194096 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.14.3-x86_64-linux) + nokogiri (1.16.2-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) @@ -239,7 +239,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.7) - racc (1.6.2) + racc (1.7.3) rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) From 48b6ec515f2606f463596fe1932ae1100d700b9f Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Tue, 6 Feb 2024 20:22:46 -0800 Subject: [PATCH 43/55] Update 2024-02-05-evolution-of-mlplatform.md --- _posts/2024-02-05-evolution-of-mlplatform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index b17f0ca..dc2df23 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -9,7 +9,7 @@ tags: - ml-platform-series --- -Machine Learning Platforms (ML Platforms) have the potential to be a key component to achieving production ML at scale without large technical debt, yet ML Platforms are not often well understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms in an effort to increase an understanding of these and how they can best be applied to bring value. +Machine Learning Platforms (ML Platforms) have the potential to be a key component in achieving production ML at scale without large technical debt, yet ML Platforms are not often understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms in an effort to increase an understanding of these platforms and how they can best be applied. 
Technical Debt and development velocity defined From 88cce8e236c0dcc1d0b671991b3ec58992b12c9e Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Thu, 8 Feb 2024 19:59:01 -0800 Subject: [PATCH 44/55] Update 2024-02-05-evolution-of-mlplatform.md [WIP] refactor links and move benefits to bottom with more specific examples --- _posts/2024-02-05-evolution-of-mlplatform.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index dc2df23..10943bc 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -30,7 +30,7 @@ The idea behind technical debt is to highlight the consequences of prioritizing Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems -> +> /todo fix link > [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) > As the machine learning (ML) community continues to accumulate years of experience with livesystems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML sys-tems is relatively fast and cheap, but maintaining them over time is difficult and expensive @@ -82,7 +82,15 @@ Platforms emerged as a strategic solution, delicately abstracting unnecessary de ### ML Ops -- Reducing technical debt of machine learning The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps, a methodology that takes inspiration from and incorporates best practices of the DevOps , tailoring them to address the distinctive challenges and workflows inherent in machine learning in an effort to control technical debt. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. + +The Rise of Machine Learning Platform +------------------------------------- + +The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. 
ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle. These platforms address the challenges of cognitive load, technical debt, quality and developer velocity and increase efficiency, collaboration, and sustainability. As the ML team grows, the benefits amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality. +### Scribd's ML Platform -- MLOps in Action +/todo Some examples of concepts of DevOps applied to ML (aka ML Ops) are: 1. **Automation:** @@ -120,13 +128,6 @@ Some examples of concepts of DevOps applied to ML (aka ML Ops) are: * Applying version control to experiments, machine learning models and features provides better change management and auditing of these ML artifacts -MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. - -The Rise of Machine Learning Platform -------------------------------------- - -The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle. These platforms address the challenges of cognitive load, technical debt, quality and developer velocity and increase efficiency, collaboration, and sustainability. As the ML team grows, the benefits amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality. 
- ### Benefits to the Organization The adoption of a Machine Learning Platform unfolds a spectrum of benefits: From 65e33686d9292ec1842cc66e9d337a4c4e6729aa Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 9 Feb 2024 17:25:59 -0800 Subject: [PATCH 45/55] Update 2024-02-05-evolution-of-mlplatform.md fix links and add details about scribds ml platform --- _posts/2024-02-05-evolution-of-mlplatform.md | 69 ++++++-------------- 1 file changed, 21 insertions(+), 48 deletions(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index 10943bc..8ae666d 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -30,12 +30,11 @@ The idea behind technical debt is to highlight the consequences of prioritizing Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems -> /todo fix link -> [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) > As the machine learning (ML) community continues to accumulate years of experience with livesystems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML sys-tems is relatively fast and cheap, but maintaining them over time is difficult and expensive > -> [https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) Technical debt is important to consider especially when trying to move fast. Moving fast is easy, moving fast without acquiring technical debt is alot more complicated. @@ -65,7 +64,7 @@ This shift to DevOps and teams teams owning the entire development lifecycle int > The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks. > -> — [](https://teamtopologies.com/book "https://teamtopologies.com/book")[https://teamtopologies.com/book](https://teamtopologies.com/book) +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) As teams grapple with the mental effort required by adopting DevOps of understanding, operating, and maintaining systems, cognitive load becomes a barrier to efficiency. 
The weight of this additional load can hinder productivity, prompting organizations to seek solutions. @@ -73,11 +72,11 @@ Platforms emerged as a strategic solution, delicately abstracting unnecessary de > The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services. > -> — [](https://teamtopologies.com/book "https://teamtopologies.com/book")[https://teamtopologies.com/book](https://teamtopologies.com/book) +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) -> _Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users_ +> Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users > -> \- [https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) +> [Rowse & Shepherd (2022) Building Infrastructure Platforms](https://martinfowler.com/articles/building-infrastructure-platform.html) ### ML Ops -- Reducing technical debt of machine learning @@ -87,66 +86,40 @@ MLOps is a methodology that provides a collection of concepts and workflows desi The Rise of Machine Learning Platform ------------------------------------- -The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. 
ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle. These platforms address the challenges of cognitive load, technical debt, quality and developer velocity and increase efficiency, collaboration, and sustainability. As the ML team grows, the benefits amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality. +The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle. As the ML team grows, the benefits of a platform amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality and not getting bogged down with technical debt. + ### Scribd's ML Platform -- MLOps in Action -/todo -Some examples of concepts of DevOps applied to ML (aka ML Ops) are: +At Scribd we have applied concepts from DevOps to our ML Operations in the following ways 1. **Automation:** - - 1. Automation can be applied to many parts of the machine learning lifecycle. The incorporation of automation not only streamlines processes but also addresses technical debt through the establishment of consistency and a standardized and reproducible approach. - - 2. Model deployments which can be automated by the implementation of DevOps CI/CD strategies. - - 3. Automation can also be applied to retraining of machine learning models + + * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS based endpoints. + * Automating Model training throug the use of Airflow DAGS and allowing these DAGS to trigger the deployment pipelines to deploy a model once re-training has occured. 2. 
**Continuous** **Testing:** - * Continuous testing can be applied as part of a model deployment pipeline, removing the need for manual testing (increasing development velocity) and removing technical debt by ensuring tests are performed consistently - - * Model validation can be automated using tooling providing consistency between training iterations. + * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing. + * Increased tooling to support model validation testing. 3. **Monitoring:** - - * Monitoring provides key insights and a steps towards creating vital feedback loops. - - * Monitoring can be applied to real time inference infrastructure revealing performance concerns similar to dev ops. - * Monitoring can be applied to Model performance and monitor for model drift in realtime, providing realtime insight and analysis to model performance and when it may need to be retrained. + * Monitoring real time inference endpoints + * Monitoring training DAGS 4. **Collaboration and Communication:** - - * Utilize collaboration tools for effective communication and information sharing among team members. - - * Feature Store's provides a platform for discovering, re using and collaborating on ML features + + * Feature Store which provides feature discovery and re-use + * Model Database which provides model collaboration - * Model Database's provide a platform for discovering, re using and collaborating on ML Models - -5. **Version Control:** +6. 
**Version Control:** - * Applying version control to experiments, machine learning models and features provides better change management and auditing of these ML artifacts + * Applyied version control to experiments, machine learning models and features -### Benefits to the Organization - -The adoption of a Machine Learning Platform unfolds a spectrum of benefits: - -**Increasing Flow of Change (aka developer velocity):** A swift pace in model development and deployment, enhancing overall efficiency. - -**Fostering Collaboration Amongst Teams:** Breaking down silos and promoting cross-functional collaboration. The platform becomes the silent foundation for collaboration, facilitating a harmonious working environment. - -**Enforcing Best Practices:** Standardizing and ensuring adherence to best practices across ML projects. - -**Reducing/Limiting Technical Debt:** Strategically mitigating the risk of accumulating technical debt, ensuring long-term sustainability. - -**Multiplier Effect:** As the ML team grows, these benefits of the platform amplify—a dividend that multiplies with organizational growth. 
- References ---------- -[https://www.youtube.com/watch?v=Bfhl8kcSaEI&embeds\_referring\_euri=https%3A%2F%2Fplatformengineering.org%2F&feature=emb\_imp\_woyt](https://www.youtube.com/watch?v=Bfhl8kcSaEI&embeds_referring_euri=https%3A%2F%2Fplatformengineering.org%2F&feature=emb_imp_woyt) - [https://www.atlassian.com/devops/frameworks/team-topologies](https://www.atlassian.com/devops/frameworks/team-topologies) [https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) From 698a85802a2d7dcba663078e7c6386f5e49bf2e5 Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 9 Feb 2024 17:43:05 -0800 Subject: [PATCH 46/55] Update 2024-02-05-evolution-of-mlplatform.md Fix references --- _posts/2024-02-05-evolution-of-mlplatform.md | 21 ++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index 8ae666d..74655c1 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -30,6 +30,7 @@ The idea behind technical debt is to highlight the consequences of prioritizing Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper suggest that ML systems have the propensity to easily gain this technical debt. > Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. 
Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems +> > [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) > As the machine learning (ML) community continues to accumulate years of experience with livesystems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML sys-tems is relatively fast and cheap, but maintaining them over time is difficult and expensive @@ -120,22 +121,22 @@ At Scribd we have applied concepts from DevOps to our ML Operations in the follo References ---------- -[https://www.atlassian.com/devops/frameworks/team-topologies](https://www.atlassian.com/devops/frameworks/team-topologies) +[Bottcher (2018, March 05). What I Talk About When I Talk About Platforms. https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) -[https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) +[D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-Franc¸ois Crespo, Dan Dennison (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) -[https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) +[Fowler (2022, October 20).Conway's Law. 
https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) -[https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) +[Galante, What is platform engineering. https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) -[https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) +[Humanitec, State of Platform Engineering Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) -[https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) +[Hodgson (2023, July 19). How platform teams get stuff done. https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) -[https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) +[Murray (2017, April 27). The Art of Platform Thinking. https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) -[https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) +[Rouse (2017, March 20). Technical Debt. https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) -[https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) +[Rowse & Shepherd (2022). Building Infrastructure Platforms.
https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) -[https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) +[Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) From 9a88cbaaf3c5f56e625297c6ec96f66fc6a112ba Mon Sep 17 00:00:00 2001 From: Ben Shaw Date: Fri, 9 Feb 2024 17:53:45 -0800 Subject: [PATCH 47/55] Update 2024-02-05-evolution-of-mlplatform.md Reduce cruft --- _posts/2024-02-05-evolution-of-mlplatform.md | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index 74655c1..089d8a1 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -17,7 +17,7 @@ Technical Debt and development velocity defined ### Development Velocity -Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment and maintenance. It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing to model training, evaluation, deployment, and ongoing optimization. In platform engineering this is often referred to as rate of change. +Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment in a production environment. It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing to model training, evaluation, validation deployment and testing for new models or for re-training, validation and deployment of existing models. 
### Technical Debt @@ -44,7 +44,7 @@ The Evolution Of ML Platforms ### DevOps -- The paradigm shift that led the way -DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. Dev ops has become standard practice in modern software development. The adoption of DevOps has been widespread across various industries, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: +DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. DevOps has become standard practice in modern software development, and its adoption has been widespread, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: 1. **Automation** @@ -67,7 +67,7 @@ This shift to DevOps and teams teams owning the entire development lifecycle int > > [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) -As teams grapple with the mental effort required by adopting DevOps of understanding, operating, and maintaining systems, cognitive load becomes a barrier to efficiency. The weight of this additional load can hinder productivity, prompting organizations to seek solutions. +The additional load that DevOps introduces by having teams own the entire software development lifecycle can hinder productivity, prompting organizations to seek solutions. Platforms emerged as a strategic solution, delicately abstracting unnecessary details of the development lifecycle.
This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow. @@ -81,17 +81,12 @@ Platforms emerged as a strategic solution, delicately abstracting unnecessary de ### ML Ops -- Reducing technical debt of machine learning -The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps, a methodology that takes inspiration from and incorporates best practices of the DevOps , tailoring them to address the distinctive challenges and workflows inherent in machine learning in an effort to control technical debt. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. -MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role in ensuring the efficiency, reliability, and scalability of machine learning implementations over time. +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps. MLOps is a methodology that takes inspiration from and incorporates best practices of the DevOps, tailoring them to address the distinctive challenges inherent in machine learning. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. 
Correctly applied MLOps can play a pivotal role in controlling technical debt and ensuring the efficiency, reliability, and scalability of the machine learning lifecycle over time. -The Rise of Machine Learning Platform +Scribd's ML Platform -- MLOps and Platforms in Action ------------------------------------- - -The paradigm shifts of DevOps, MLOps and Platform Thinking led to the emergence of Machine Learning platforms. ML platforms are the application of MLOps concepts and workflows and provide a curated developer experience for Machine Learning developers throughout the entire ML lifecycle. As the ML team grows, the benefits of a platform amplify, creating a multiplier effect that allows organizations to scale whilst maintaining quality and not getting bogged down with technical debt. - - -### Scribd's ML Platform -- MLOps in Action -At Scribd we have applied concepts from DevOps to our ML Operations in the following ways +At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers and applies the concepts of DevOps in the following ways 1.
**Automation:** From 441d41a44a3a230c51041429ccb969ac95d90ad5 Mon Sep 17 00:00:00 2001 From: ben Date: Thu, 15 Feb 2024 17:18:12 -0800 Subject: [PATCH 48/55] refined scribs platform section --- _posts/2024-02-05-evolution-of-mlplatform.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md index 089d8a1..37f22c2 100644 --- a/_posts/2024-02-05-evolution-of-mlplatform.md +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -86,31 +86,27 @@ MLOps is a methodology that provides a collection of concepts and workflows desi Scribd's ML Platform -- MLOps and Platforms in Action ------------------------------------- -At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers and applies the concepts of DevOps in the following ways +At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers. This platform has been built with MLOps in mind which can be seen through its use of common DevOps principles. -1. **Automation:** - +1. **Automation:** * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS based endpoints. * Automating Model training throug the use of Airflow DAGS and allowing these DAGS to trigger the deployment pipelines to deploy a model once re-training has occured. 2. **Continuous** **Testing:** - * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing. * Increased tooling to support model validation testing. 3. **Monitoring:** - * Monitoring real time inference endpoints * Monitoring training DAGS + * Monitoring batch jobs 4. 
**Collaboration and Communication:** - * Feature Store which provides feature discovery and re-use * Model Database which provides model collaboration 6. **Version Control:** - - * Applyied version control to experiments, machine learning models and features + * Applying version control to experiments, machine learning models and features References From d869fcdcd8f973753fe8f01a0868e2c8f4957f91 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 May 2024 23:38:52 +0000 Subject: [PATCH 49/55] Bump nokogiri from 1.16.2 to 1.16.5 Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.2 to 1.16.5. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.2...v1.16.5) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6194096..07c6d1a 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -231,7 +231,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.16.2-x86_64-linux) + nokogiri (1.16.5-x86_64-linux) racc (~> 1.4) octokit (4.22.0) faraday (>= 0.9) From 6682d280fe0684646940411b555b1b190a56ba98 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 2 Aug 2024 00:01:19 +0000 Subject: [PATCH 50/55] Bump rexml from 3.2.5 to 3.3.3 Bumps [rexml](https://github.com/ruby/rexml) from 3.2.5 to 3.3.3. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.2.5...v3.3.3) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6194096..67d83d9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -243,7 +243,8 @@ GEM rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.5) + rexml (3.3.3) + strscan rouge (3.26.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) @@ -258,6 +259,7 @@ GEM faraday (> 0.8, < 2.0) simpleidn (0.2.1) unf (~> 0.1.4) + strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) From 50105b265c5f6a1372eb6401a7db9d0508e475f1 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Wed, 15 Jan 2025 15:35:54 +0000 Subject: [PATCH 51/55] Add the webinar blog post for Kuntal's and my Delta Lake presentation with slideshare embed, whee! --- .../2025-01-15-cloud-native-data-ingestion.md | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 _posts/2025-01-15-cloud-native-data-ingestion.md diff --git a/_posts/2025-01-15-cloud-native-data-ingestion.md b/_posts/2025-01-15-cloud-native-data-ingestion.md new file mode 100644 index 0000000..2df1b88 --- /dev/null +++ b/_posts/2025-01-15-cloud-native-data-ingestion.md @@ -0,0 +1,35 @@ +--- +layout: post +title: "Cloud-native Data Ingestion with AWS Aurora and Delta Lake" +team: "Core Infrastructure" +author: rtyler +tags: +- deltalake +- rust +- featured +--- + + +One of the major themes for Infrastructure Engineering over the past couple +years has been higher reliability and better operational efficiency. In a +recent session with the [Delta Lake](https://delta.io) project I was able to +share the work led by Kuntal Basu and a number of other people to _dramatically_ +improve the efficiency and reliability of our online data ingestion pipeline. + + +> Join Kuntal Basu, Staff Infrastructure Engineer, and R. Tyler Croy, Principal +> Engineer at Scribd, Inc. as they take you behind the scenes of Scribd’s data +> ingestion setup.
They’ll break down the architecture, explain the tools, and +> walk you through how they turned off-the-shelf solutions into a robust +> pipeline. + + +## Video + +
+ + +## Presentation + +
+ From 98cababa78942b997c31d22de87e23edec463b62 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Jan 2025 15:56:07 +0000 Subject: [PATCH 52/55] Bump webrick from 1.7.0 to 1.8.2 Bumps [webrick](https://github.com/ruby/webrick) from 1.7.0 to 1.8.2. - [Release notes](https://github.com/ruby/webrick/releases) - [Commits](https://github.com/ruby/webrick/compare/v1.7.0...v1.8.2) --- updated-dependencies: - dependency-name: webrick dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index eb6e096..dd401a3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -274,7 +274,7 @@ GEM unf_ext (0.0.8.1) unicode-display_width (1.8.0) wdm (0.1.1) - webrick (1.7.0) + webrick (1.8.2) zeitwerk (2.6.6) PLATFORMS From 067ca836e159ae6fec0d02956f3d3f2b13fd5c58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Jan 2025 15:56:07 +0000 Subject: [PATCH 53/55] Bump rexml from 3.3.3 to 3.3.9 Bumps [rexml](https://github.com/ruby/rexml) from 3.3.3 to 3.3.9. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.3.3...v3.3.9) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- Gemfile.lock | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index eb6e096..0ad34f0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -243,8 +243,7 @@ GEM rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.3.3) - strscan + rexml (3.3.9) rouge (3.26.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) @@ -259,7 +258,6 @@ GEM faraday (> 0.8, < 2.0) simpleidn (0.2.1) unf (~> 0.1.4) - strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) From aab7bc951dce6313327425b5cf07d9eca941461e Mon Sep 17 00:00:00 2001 From: "oleh.motrunych" Date: Fri, 14 Mar 2025 11:30:43 +0100 Subject: [PATCH 54/55] terrafrom-oxbow module --- _posts/2025-03-14-terraform-oxbow-module.md | 61 +++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 _posts/2025-03-14-terraform-oxbow-module.md diff --git a/_posts/2025-03-14-terraform-oxbow-module.md b/_posts/2025-03-14-terraform-oxbow-module.md new file mode 100644 index 0000000..ab48af2 --- /dev/null +++ b/_posts/2025-03-14-terraform-oxbow-module.md @@ -0,0 +1,61 @@ +--- +layout: post +title: "Terraform module to manage Oxbow Lambda and its components" +tags: +- Oxbow +- Terraform +- AWS +- deltalake +- rust +team: Core Infrastructure +author: Oleh Motrunych +--- + + +[Oxbow](https://github.com/buoyant-data/oxbow) is a project to turn an existing storage location which contains [Apache Parquet](https://parquet.apache.org/) files into a [Delta Lake table](https://delta.io/). +It is intended to run either as an AWS Lambda or as a command-line application. +We are excited to introduce [terraform-oxbow](https://github.com/scribd/terraform-oxbow), an open-source Terraform module that simplifies the deployment and management of AWS Lambda and its supporting components.
Whether you're working with AWS Glue, Kinesis Data Firehose, SQS, or DynamoDB, this module provides a streamlined approach to infrastructure as code (IaC) in AWS. + +### ✨ Why terraform-oxbow? +Managing event-driven architectures in AWS can be complex, requiring careful orchestration of multiple services. Terraform-oxbow abstracts much of this complexity, enabling users to configure key components with simple boolean flags and module parameters. This is an easy and efficient way to create a Delta table from Apache Parquet files. +### 🚀 Features + +With **terraform-oxbow**, you can deploy: + +- AWS Oxbow Lambda with customizable configurations +- Kinesis Data Firehose for real-time data streaming +- SQS and SQS Dead Letter Queues for event-driven messaging +- IAM policies for secure access management +- S3 bucket notifications to trigger Lambda functions +- DynamoDB tables for data storage and locking +- AWS Glue Catalog and Tables for schema management + + +### ⚙️ How It Works + +This module follows a modular approach, allowing users to enable or disable services based on their specific use case. Here are a few examples: + +- To enable AWS Glue Catalog and Tables: ```hcl +enable_aws_glue_catalog_table = true +``` + +- To enable Kinesis Data Firehose delivery stream: ```hcl +enable_kinesis_firehose_delivery_stream = true +``` + +- To enable S3 bucket notifications: ```hcl +enable_bucket_notification = true +``` + +- To enable advanced Oxbow Lambda setup for multi-table filtered optimization: ```hcl +enable_group_events = true +``` + +- AWS S3 bucket notifications have limitations: Due to AWS constraints, an S3 bucket can only have a single notification configuration. If you need to trigger multiple Lambda functions from the same S3 bucket, consider using event-driven solutions like SNS or SQS. + + +- IAM Policy Management: The module provides the necessary permissions but follows the principle of least privilege.
Ensure your IAM policies align with your security requirements. + + +- Scalability and Optimization: The module allows fine-grained control over Lambda concurrency, event filtering, and data processing configurations to optimize costs and performance + From 623a948b02c8b8412ddf5f83a645d53bf87bfb53 Mon Sep 17 00:00:00 2001 From: "R. Tyler Croy" Date: Mon, 2 Jun 2025 18:19:30 +0000 Subject: [PATCH 55/55] Properly reference the jobs which are now on Ashby --- assets/js/jobs.js | 23 +++++++++++------------ careers.html | 8 ++++---- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/assets/js/jobs.js b/assets/js/jobs.js index 07d3b18..8172cb9 100644 --- a/assets/js/jobs.js +++ b/assets/js/jobs.js @@ -5,7 +5,7 @@ * * With that disclaimer out of the way... * - * This file handles the fetching of jobs from Lever such that they can be + * This file handles the fetching of jobs from Lever^WAshby such that they can be * dynamically inserted into different parts of the tech blog */ @@ -13,7 +13,7 @@ * This API will return an list of departments which must then be filtered * through to find the .postings under each */ -const API_URL = 'https://api.lever.co/v0/postings/scribd?group=department&mode=json' +const API_URL = 'https://api.ashbyhq.com/posting-api/job-board/scribd?includeCompensation=true' /* @@ -37,21 +37,20 @@ function fetchJobs() { return fetch(API_URL) .then(async (response) => { - const departments = await response.json(); + const board = await response.json(); /* * Since this is the tech blog, we're only pulling a couple of * departments */ - departments - .filter(d => ['Engineering', 'Data Science', 'Design', 'Business Analytics', 'Product'].includes(d.title)) - .forEach((department) => { - department.postings.forEach((posting) => { - const team = posting.categories.team; + board.jobs + .filter(j => ['Engineering', 'Product, Design, & Analytics', 'Product'].includes(j.department)) + .filter(j => 
!j.title.toLowerCase().includes('marketing')) + .forEach((job) => { + const team = job.team; if (!window.jobsCache[team]) { window.jobsCache[team] = []; } - window.jobsCache[team].push(posting); - }); + window.jobsCache[team].push(job); }); window.jobsFetched = true; return window.jobsCache; @@ -98,9 +97,9 @@ function renderJobs(elem, team, randomLimit) { li.innerHTML = `
- ${job.text} + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

`; elem.appendChild(li); diff --git a/careers.html b/careers.html index 44d5367..5a5072a 100644 --- a/careers.html +++ b/careers.html @@ -159,14 +159,14 @@

${team} li.innerHTML = `
- - ${job.text} + + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

- `;