diff --git a/Gemfile b/Gemfile index fd51732..f0f42ae 100644 --- a/Gemfile +++ b/Gemfile @@ -1,18 +1,21 @@ source "https://rubygems.org/" -gem "jekyll", "~> 3.8.5" +#gem "jekyll", "~> 4.0" gem 'kramdown' gem 'rouge' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. -gem "github-pages", "~> 201", group: :jekyll_plugins +gem "github-pages", group: :jekyll_plugins # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.11" + gem "jekyll-feed" + gem 'jekyll-paginate' end +gem 'webrick' + # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem # and associated library. install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! } do diff --git a/Gemfile.lock b/Gemfile.lock index bdeb413..9859e75 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,228 +1,252 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.11.1) - i18n (~> 0.7) + activesupport (6.0.6.1) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.7.0) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.8.0) public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.5) - dnsruby (1.61.3) - addressable (~> 2.5) - em-websocket (0.5.1) + commonmarker (0.23.10) + concurrent-ruby (1.2.0) + dnsruby (1.61.9) + simpleidn (~> 0.1) + em-websocket (0.5.3) eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) + http_parser.rb (~> 0) + ethon (0.15.0) + ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.7.0) - faraday (0.17.0) + execjs (2.8.1) + faraday (1.10.0) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0) + faraday-multipart (~> 1.0) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.0) + faraday-patron (~> 1.0) + faraday-rack (~> 1.0) + faraday-retry (~> 1.0) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-multipart (1.0.3) multipart-post (>= 1.2, < 3) - ffi (1.11.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + faraday-rack (1.0.0) + faraday-retry (1.0.3) + ffi (1.15.5) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (201) - activesupport (= 4.2.11.1) - github-pages-health-check (= 1.16.1) - jekyll (= 3.8.5) - jekyll-avatar (= 0.6.0) + github-pages (226) + github-pages-health-check (= 1.17.9) + jekyll (= 3.9.2) + jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) + jekyll-commonmark-ghpages (= 0.2.0) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.11.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.12.1) - jekyll-mentions (= 1.4.1) - jekyll-optional-front-matter (= 0.3.0) + jekyll-github-metadata (= 2.13.0) + jekyll-include-cache (= 0.2.1) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.2.0) - jekyll-redirect-from (= 0.14.0) - jekyll-relative-links (= 0.6.0) - jekyll-remote-theme (= 0.4.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.3) jekyll-sass-converter (= 
1.5.2) - jekyll-seo-tag (= 2.5.0) - jekyll-sitemap (= 1.2.0) - jekyll-swiss (= 0.4.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) - jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.3) - jekyll-theme-slate (= 0.1.1) - jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.1) - jemoji (= 0.10.2) - kramdown (= 1.17.0) - liquid (= 4.0.0) - listen (= 3.1.5) + jekyll-seo-tag (= 2.8.0) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.2.0) + jekyll-theme-cayman (= 0.2.0) + jekyll-theme-dinky (= 0.2.0) + jekyll-theme-hacker (= 0.2.0) + jekyll-theme-leap-day (= 0.2.0) + jekyll-theme-merlot (= 0.2.0) + jekyll-theme-midnight (= 0.2.0) + jekyll-theme-minimal (= 0.2.0) + jekyll-theme-modernist (= 0.2.0) + jekyll-theme-primer (= 0.6.0) + jekyll-theme-slate (= 0.2.0) + jekyll-theme-tactile (= 0.2.0) + jekyll-theme-time-machine (= 0.2.0) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.2) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.5.0) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.11.0) + minima (= 2.5.1) + nokogiri (>= 1.13.4, < 2.0) + rouge (= 3.26.0) terminal-table (~> 1.4) - github-pages-health-check (1.16.1) + github-pages-health-check (1.17.9) addressable (~> 2.3) dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 3.0) + public_suffix (>= 3.0, < 5.0) typhoeus (~> 1.3) - html-pipeline (2.12.0) + html-pipeline (2.14.1) activesupport (>= 2) nokogiri (>= 1.4) - http_parser.rb (0.6.0) + http_parser.rb (0.8.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.5) + jekyll (3.9.2) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.6.0) - jekyll (~> 3.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.2.0) + commonmarker (~> 0.23.4) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.11.0) - jekyll (~> 3.3) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.12.1) - jekyll (~> 3.4) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.4.1) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) - jekyll-optional-front-matter (0.3.0) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-readme-index (0.2.0) - jekyll (~> 3.0) - jekyll-redirect-from (0.14.0) - jekyll (~> 3.3) - jekyll-relative-links (0.6.0) - jekyll (~> 3.3) - jekyll-remote-theme (0.4.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + 
jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.3) addressable (~> 2.0) - jekyll (~> 3.5) - rubyzip (>= 1.2.1, < 3.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) + jekyll-theme-cayman (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 3.5) + jekyll-theme-dinky (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) + jekyll-theme-leap-day (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) + jekyll-theme-merlot (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) + jekyll-theme-midnight (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) + jekyll-theme-minimal (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) + jekyll-theme-modernist (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.3) - jekyll (~> 3.5) + jekyll-theme-primer (0.6.0) + jekyll (> 3.5, < 5.0) jekyll-github-metadata (~> 2.9) jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) + jekyll-theme-slate (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) + jekyll-theme-tactile (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) + jekyll-theme-time-machine (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.1) - jekyll (~> 3.3) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.10.2) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (~> 3.0) - kramdown (1.17.0) - liquid (4.0.0) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.2) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.1) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.5.1) - minima (2.5.0) - jekyll (~> 3.5) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.12.2) + minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.11.5) - mini_portile2 (~> 2.5.0) + nokogiri (1.16.5-x86_64-linux) racc (~> 1.4) - octokit (4.14.0) + octokit (4.22.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (3.1.1) - racc (1.5.2) - rb-fsevent (0.10.3) - rb-inotify (0.10.0) + public_suffix (4.0.7) + racc (1.7.3) + rb-fsevent (0.11.1) + rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.11.0) - ruby-enum (0.7.2) - 
i18n - ruby_dep (1.5.0) - rubyzip (2.0.0) + rexml (3.3.9) + rouge (3.26.0) + ruby2_keywords (0.0.5) + rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) @@ -232,30 +256,38 @@ GEM sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) + simpleidn (0.2.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.3.1) + typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.5) + tzinfo (1.2.10) thread_safe (~> 0.1) - tzinfo-data (1.2019.3) + tzinfo-data (1.2022.1) tzinfo (>= 1.0.0) - unicode-display_width (1.6.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.8.1) + unicode-display_width (1.8.0) wdm (0.1.1) + webrick (1.8.2) + zeitwerk (2.6.6) PLATFORMS - ruby + x86_64-linux DEPENDENCIES - github-pages (~> 201) - jekyll (~> 3.8.5) - jekyll-feed (~> 0.11) + github-pages + jekyll-feed + jekyll-paginate kramdown rouge tzinfo (~> 1.2) tzinfo-data wdm (~> 0.1.1) + webrick BUNDLED WITH - 2.0.2 + 2.3.8 diff --git a/_category/data-science.md b/_category/data-science.md index aa400d2..073e6db 100644 --- a/_category/data-science.md +++ b/_category/data-science.md @@ -1,4 +1,4 @@ --- -team: Data Science +team: Applied Research permalink: "/blog/category/data-science" --- diff --git a/_config.yml b/_config.yml index 6c2231a..3e577cc 100644 --- a/_config.yml +++ b/_config.yml @@ -7,7 +7,7 @@ description: >- # this means to ignore newlines until "baseurl:" baseurl: "" # the subpath of your site, e.g. /blog url: "https://tech.scribd.com/" # the base hostname & protocol for your site, e.g. http://example.com google_analytics: 'UA-443684-30' -featured_series: 'airflow-series' +featured_series: 'kyc-series' # GitHub Metadata # Used for "improve this page" link diff --git a/_data/authors.yml b/_data/authors.yml index 684e972..69d8120 100644 --- a/_data/authors.yml +++ b/_data/authors.yml @@ -3,6 +3,13 @@ # description, etc --- +bshaw: + name: Ben Shaw + github: benshaw + twitter: ben_a_shaw + about: | + Ben leads the ML Platform group, helping scale production Machine Learning at scribd. Other times you will find him outside playing in the mountains. + alexjb: name: Alex Bernardin github: alexofmanytrades @@ -136,4 +143,37 @@ div: gregr: name: Greg Reznik github: imfromthebay + + +jonathanr: + name: Jonathan Ramkissoon + twitter: _JRamkissoon + github: jramkiss + blog: https://jramkiss.github.io/ + about: | + Jonathan is a data scientist on the Applied Research team building machine learning models to understand and connect our content. + +antoniam: + name: Antonia Mouawad + github: AntoniaMouawad + about: | + Antonia is a data scientist on the Applied Research team building machine learning models to understand and connect our content. +nathans: + name: Nathan Sass + github: NathanSass + about: | + Nathan is a software engineer on the Android platform team. + +rafaelp: + name: Rafael Lacerda + github: lacerda + blog: https://blog.lacerda.ch/ + about: | + Rafael is a data scientist on the Applied Research team building machine learning models to understand and connect our content. + +moniquec: + name: Monique Alves Cruz + github: MAlvesCruz + about: | + Monique is a data scientist on the Applied Research team building machine learning models to understand and connect our content. 
diff --git a/_data/team-structure.yml b/_data/team-structure.yml index 86950f5..90f26e4 100644 --- a/_data/team-structure.yml +++ b/_data/team-structure.yml @@ -26,9 +26,9 @@ about titles in our library by analyzing content and user behavior and building predictive models. -- team: Data Science +- team: Applied Research description: | - The Data Science team drives decisions by creating insights into the product + The Applied Research team drives decisions by creating insights into the product and improve the user experience with machine learning. - team: Core Platform @@ -75,7 +75,7 @@ - team: iOS description: | The iOS team's mission is to deliver a performant, stable and feature-rich - Android application. + iOS application. - team: Web Development description: | @@ -91,3 +91,8 @@ description: | The Web QA team strives for a defect-free Scribd website known for its reliability. + +- team: Service Foundations + description: | + The Service Foundations team provides reliable, high-quality, scalable service foundations + that teams can leverage to easily build, deploy and monitor self-owned, distributed services. diff --git a/_data/teams.yml b/_data/teams.yml index df9870e..56a4665 100644 --- a/_data/teams.yml +++ b/_data/teams.yml @@ -8,6 +8,9 @@ iOS: Android: lever: 'Mobile' +Applied Research: + lever: 'Data Science' + Data Science: lever: 'Data Science' diff --git a/_includes/team-color-logic.html b/_includes/team-color-logic.html index 76dac8b..e87eb10 100644 --- a/_includes/team-color-logic.html +++ b/_includes/team-color-logic.html @@ -9,7 +9,7 @@ {% elsif page.team == "Android" %} {% assign theme = 'grass' %} - {% elsif page.team == "Data Science" %} + {% elsif page.team == "Applied Research" %} {% assign theme = 'slate' %} {% elsif page.team == "Web Development" %} diff --git a/_layouts/home.html b/_layouts/home.html index b92aa95..3439c1f 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -21,7 +21,7 @@

Help us build our next project.

-

We're on a mission to change the way the world reads. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

+

We're on a mission to build the largest and most accessible library connecting storytellers with their audience. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

All Positions diff --git a/_posts/2018-01-05-neural-spelling-corrections.md b/_posts/2018-01-05-neural-spelling-corrections.md index 46205af..9bbbc99 100644 --- a/_posts/2018-01-05-neural-spelling-corrections.md +++ b/_posts/2018-01-05-neural-spelling-corrections.md @@ -5,7 +5,7 @@ author: mattr tags: - seq2seq - data -team: Data Science +team: Applied Research --- Introduction diff --git a/_posts/2018-02-12-search-query-parsing.md b/_posts/2018-02-12-search-query-parsing.md index c3937e9..49e4498 100644 --- a/_posts/2018-02-12-search-query-parsing.md +++ b/_posts/2018-02-12-search-query-parsing.md @@ -5,7 +5,7 @@ author: mattr tags: - search - data -team: Data Science +team: Applied Research --- Scribd has a variety of content to offer and connecting our users with their desired content is a crucial aspect of our product. One of the main ways that users find content on Scribd is through search, and in this post I want to delve into an analysis we did regarding parsing out valuable information from a user’s query in order to better serve them relevant results, and also learn more about what they are searching for. diff --git a/_posts/2018-03-20-scribds-ab-testing.md b/_posts/2018-03-20-scribds-ab-testing.md index 2b3e3ad..59a21d0 100644 --- a/_posts/2018-03-20-scribds-ab-testing.md +++ b/_posts/2018-03-20-scribds-ab-testing.md @@ -5,7 +5,7 @@ author: dfeldman tags: - testing - data -team: Data Science +team: Applied Research --- What is A/B testing? diff --git a/_posts/2018-04-18-bandits-for-the-win.md b/_posts/2018-04-18-bandits-for-the-win.md index d0a0734..a70db49 100644 --- a/_posts/2018-04-18-bandits-for-the-win.md +++ b/_posts/2018-04-18-bandits-for-the-win.md @@ -5,7 +5,7 @@ author: dfeldman tags: - testing - data -team: Data Science +team: Applied Research --- We love A/B testing at Scribd. What follows is a specific example to give you an inside look at the process from idea to implementation for an algorithm test. diff --git a/_posts/2018-05-31-non-random-seo-test.md b/_posts/2018-05-31-non-random-seo-test.md index b115427..262d008 100644 --- a/_posts/2018-05-31-non-random-seo-test.md +++ b/_posts/2018-05-31-non-random-seo-test.md @@ -6,7 +6,7 @@ tags: - seo - testing - data -team: Data Science +team: Applied Research --- Months ago, your friends convinced you to sign up for a half marathon. With three weeks to go, you haven’t even started training. In a growing panic, you turn to the internet for answers. diff --git a/_posts/2019-02-07-calculating-customer-lifetime-revenue.md b/_posts/2019-02-07-calculating-customer-lifetime-revenue.md index cbb708a..d4c679e 100644 --- a/_posts/2019-02-07-calculating-customer-lifetime-revenue.md +++ b/_posts/2019-02-07-calculating-customer-lifetime-revenue.md @@ -5,7 +5,7 @@ author: bclearly tags: - ltr - data -team: Data Science +team: Applied Research --- Why LTR? (Lifetime Revenue) diff --git a/_posts/2019-03-04-experiments-with-seq2seq.md b/_posts/2019-03-04-experiments-with-seq2seq.md index 8f3beac..ff10bb4 100644 --- a/_posts/2019-03-04-experiments-with-seq2seq.md +++ b/_posts/2019-03-04-experiments-with-seq2seq.md @@ -6,7 +6,7 @@ tags: - machinelearning - seq2seq - data -team: Data Science +team: Applied Research --- How much data do you need to train a seq2seq model? Let’s say that you want to translate sentences from one language to another. You probably need a bigger dataset to translate longer sentences than if you wanted to translate shorter ones. How does the need for data grow as the sentence length increases? 
diff --git a/_posts/2020-04-29-monitoring-aws-with-panther.md b/_posts/2020-04-29-monitoring-aws-with-panther.md index 6866b4d..da5ceec 100644 --- a/_posts/2020-04-29-monitoring-aws-with-panther.md +++ b/_posts/2020-04-29-monitoring-aws-with-panther.md @@ -5,10 +5,13 @@ tags: - monitoring - aws - featured +- archived team: Security Engineering author: paha --- +***NOTE***: *Scribd’s security infrastructure has since evolved away from using Panther* + Before widespread cloud usage, it was uncommon for one person to be present for the entire datacenter development lifecycle. Very few people knew how to design and build a datacenter from scratch while ensuring appropriate security configuration settings were set, on top of rigging up monitoring. It was even more uncommon for non-sysadmins to have any involvement in data center infrastructure construction or ongoing refinement. The cloud is very different. It only takes seconds to create an entire infrastructure from a template. And even developers are doing it! The monitoring challenges for such a scenario are significant. There aren't necessarily "more" monitoring data points, but the speed with which infrastructure can be created tends to result in infrastructure getting way out over its skis with respect to monitoring. Furthermore, since many barriers to entry for doing stupid things have been lowered to the point of non-existence, monitoring is the last great hope of maintaining control over a cloud environment. While access controls can still provide some guardrails, the flexibility that all engineers need to do their jobs requires that they have the ability to do "dangerous" things that they've never had to do before. The true definition of "full stack" has expanded. diff --git a/_posts/2021-07-08-automate-databricks-with-terraform.md b/_posts/2021-07-08-automate-databricks-with-terraform.md new file mode 100644 index 0000000..f995b0c --- /dev/null +++ b/_posts/2021-07-08-automate-databricks-with-terraform.md @@ -0,0 +1,51 @@ +--- +layout: post +title: "Automating Databricks with Terraform" +team: Core Platform +author: rtyler +tags: +- databricks +- terraform +- featured +--- + +The long term success of our data platform relies on putting tools into the +hands of developers and data scientists to “choose their own adventure”. A big +part of that story has been [Databricks](https://databricks.com) which we +recently integrated with [Terraform](https://terraform.io) to make it easy to +scale a top-notch developer experience. At the 2021 Data and AI Summit, Core +Platform infrastructure engineer [Hamilton +Hord](https://github.com/HamiltonHord) and Databricks engineer [Serge +Smertin](https://github.com/nfx) presented on the Databricks terraform provider +and how it's been used by Scribd. + +In the session embedded below, they share the details on the [Databricks (Labs) +Terraform +integration](https://github.com/databrickslabs/terraform-provider-databricks) +and how it can automate literally every aspect required for a production-grade +platform: data security, permissions, continuous deployment and so on. They +also discuss the ways in which our Core Platform team enables internal +customers without acting as gatekeepers for data platform changes. Just about +anything they might need in Databricks is a pull request away! + +
+ +
+ + +In hindsight, it's mind-boggling how much manual configuration we had to +previously maintain. With the Terraform provider for Databricks we can very +easily test, reproduce, and audit hundreds of different business critical +Databricks resources. Coupling Terraform with the recent "multi-workspace" +support that Databricks unveiled in 2020 means we can also now provision an +entirely new environment in a few hours! + +Investing in data platform tools and automation is a key part of the vision for +Platform Engineering which encompasses Data Engineering, Data Operations, and +Core Platform. We have a [number of open positions](/careers/#open-positions) +at the moment, but I wanted to call special attention to the [Data Engineering +Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156) +role for which we're currently hiring. The leader of the Data Engineering team +will help deliver data tools and solutions for internal customers building on +top of Delta Lake, Databricks, Airflow, and Kafka. Suffice it to say, there's a +lot of really interesting work to be done! diff --git a/_posts/2021-07-12-identifying-document-types.md b/_posts/2021-07-12-identifying-document-types.md new file mode 100644 index 0000000..b8f8fed --- /dev/null +++ b/_posts/2021-07-12-identifying-document-types.md @@ -0,0 +1,131 @@ +--- +layout: post +title: "Identifying Document Types at Scribd" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +author: jonathanr +--- + + +User-uploaded documents have been a core component of Scribd’s business from +the very beginning, understanding what is _actually_ in the document corpus +unlocks exciting new opportunities for discovery and recommendation. +With Scribd anybody can [upload and share +documents](https://www.scribd.com/docs), analogous to YouTube and videos. Over +the years, our document corpus has become larger and more diverse which has +made understanding it an ever-increasing challenge. +Over the past year one of the missions of the Applied Research team has been to +extract key document metadata to enrich +downstream discovery systems. Our approach combines semantic understanding with +user behaviour in a multi-component machine learning system. + +This is part 1 in a series of blog posts explaining the challenges and +solutions explored while building this system. This post presents the +limitations, challenges, and solutions encountered when developing a model to +classify arbitrary user-uploaded documents. + + +## Initial Constraints + +The document corpus at Scribd stretches far and wide in terms of content, language and structure. An arbitrary document can be anything from math homework to Philippine law to engineering schematics. In the first stage of the document understanding system, we want to exploit visual cues in the documents. Any model used here must be language-agnostic to apply to arbitrary documents. This is analogous to a “first glance” from humans, where we can quickly distinguish a comic book from a business report without having to read any text. To satisfy these requirements, we use a computer vision model to predict the document type. But what is a “type”? + + + +## Identifying Document Types + +A necessary question to ask, but a difficult one to answer –  what kind of documents do we have? As mentioned in the section above, we’re interested in differentiating documents based on visual cues, such as text-heavy versus spreadsheet versus comics. 
We’re not yet interested in more granular information like fiction vs. non-fiction. + +Our approach to this challenge was twofold. Firstly, we talked to subject matter experts at Scribd about the kinds of documents they have seen in the corpus. This was and continues to be very informative, as they have domain-specific knowledge that we leverage with machine learning. The second solution was to use a data-driven method to explore documents. This consisted of creating embeddings for documents based on their usage. Clustering and plotting these embeddings on an interactive map allowed us to examine document structure in different clusters. Combining these two methods drove the definition of document types. Below is an example of one of these maps we used to explore the corpus. + + + + +
+ Map of the document corpus, built from user-interaction embeddings +
Figure 1: Map of the document corpus built from user-interaction embeddings. More on this method in a future post.
+
+
+

We converged on 6 document types, which included sheet-music, text-heavy, comics and tables. More importantly, these 6 classes don’t account for every single document in our corpus. While there are many different ways of dealing with out-of-distribution examples in the literature, our approach explicitly added an “other” class to the model and trained it. We talk more about the intuition behind it, potential solutions to the problem, and the challenges faced in the coming sections. + + +## Document Classification + +As mentioned in the introduction, we need an approach that is language and content agnostic, meaning that the same model will be appropriate for all documents, whether they contain images, text, or a combination of both. To satisfy these constraints we use a computer vision model to classify individual pages. These predictions can then be combined with other meta-data such as page count or word count to form a prediction for the entire document. + + +### Gathering Labelled Pages and Documents + +Before the model training started, we faced an interesting data gathering problem. Our goal is to classify documents, so we must gather labelled documents. However, in order to train the page classifier mentioned above, we must also gather labelled pages. Naively, it might seem appropriate to gather labelled documents and use the document label for each of its pages. This isn't appropriate as a single document can contain multiple types of pages. As an example, consider the pages in this document. + 
+ Three pages from the same document +
Figure 2: Three different pages from the same document to demonstrate why we can't take the document label and assign it to each page.
+
+ + +The first and third pages can be considered text-heavy, but definitely not the second. Taking all the pages of this document and labelling them as text-heavy would severely pollute our training and testing data. The same logic applies to each of our 6 classes. + +To circumvent this challenge, we took an active learning approach to data gathering. We started with a small set of hand-labelled pages for each class and trained binary classifiers iteratively. The binary classification problem is simpler than the multi-class problem, requiring less hand-labelled data to obtain reliable results. At each iteration, we evaluated the most confident and least confident predictions of the model to get a sense of its inductive biases. Judging from these, we supplemented the training data for the next iteration to tweak the inductive biases and have confidence in the resulting model and labels. The sheet music class is a prime example of tweaking inductive biases. Below is an example of a page that can cause a sheet music misclassification if the model learns that sheet music is any page with horizontal lines. Supplementing the training data at each iteration helps get rid of inductive biases like this. + + +
+ Example of possible sheet music misclassification from wrong inductive bias +
Figure 3: Example of possible sheet music misclassification due to wrong inductive biases.
+
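
To make this loop concrete, here is a minimal sketch of the iterative labelling process, under some assumptions of ours: the pages are already rendered into feature vectors, a simple scikit-learn classifier stands in for the actual page model, and `review_fn` represents the manual review step. None of this is the production implementation.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def active_label_loop(X_lab, y_lab, X_pool, review_fn, rounds=3, k=50):
    """Iteratively grow a labelled set for one binary class (e.g. sheet music).

    X_lab / y_lab: feature vectors and 0/1 labels for hand-labelled pages.
    X_pool:        feature vectors for not-yet-labelled pages.
    review_fn:     stand-in for the manual review step; given a batch of pages,
                   it returns their true 0/1 labels.
    """
    clf = LogisticRegression(max_iter=1000)   # stand-in for the real page model
    for _ in range(rounds):
        clf.fit(X_lab, y_lab)
        proba = clf.predict_proba(X_pool)[:, 1]

        # Inspect the least and most confident pool pages: this is where wrong
        # inductive biases (e.g. "any page with horizontal lines is sheet music")
        # tend to show up.
        order = np.argsort(proba)
        picked = np.concatenate([order[:k], order[-k:]])

        new_labels = review_fn(X_pool[picked])          # human-in-the-loop step
        X_lab = np.vstack([X_lab, X_pool[picked]])
        y_lab = np.concatenate([y_lab, new_labels])
        X_pool = np.delete(X_pool, picked, axis=0)
    return clf
```

The important part is not the particular classifier but the loop itself: reviewing the extremes of the model's confidence at each iteration is what surfaces wrong inductive biases early.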
+ +After creating these binary classifiers for each class, we have a large set of reliable labels and classifiers that can be used to gather more data if necessary. + + +### Building a Page Classifier + +The page classification problem is very similar to ImageNet classification, so we can leverage pre-trained ImageNet models. We used transfer learning in [fast.ai](https://www.fast.ai/) and [PyTorch](https://pytorch.org/) to fine-tune pre-trained computer vision architectures for the page-classifier. After initial experiments, it was clear that models with very high ImageNet accuracy, such as EfficientNet, did not perform much better on our dataset. While it’s difficult to pinpoint exactly why this is the case, we believe it is because of the nature of the classification task, the page resolutions and our data. + +We found [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf), a relatively established lightweight architecture, to be the best balance between accuracy and inference time. Because models such as ResNets and DenseNets are so large, they take a lot of time to train and iterate on. However, SqueezeNet is an order of magnitude smaller than these models, which opens up more possibilities in our training scheme. Now we can train the entire model and are not limited to using the pre-trained architecture as a feature-extractor, which is the case for larger models. + + +
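
As a rough illustration of that transfer-learning setup (the real training used fast.ai; the snippet below uses plain PyTorch/torchvision, and the hyperparameters are illustrative rather than the values we shipped):

```python
import torch
import torch.nn as nn
from torchvision import models

NUM_CLASSES = 6  # e.g. sheet music, text-heavy, comics, tables, ... plus "other"

# Start from an ImageNet-pretrained SqueezeNet and swap in a new classifier head.
model = models.squeezenet1_1(pretrained=True)
model.classifier[1] = nn.Conv2d(512, NUM_CLASSES, kernel_size=1)
model.num_classes = NUM_CLASSES

# SqueezeNet is small enough that we can fine-tune all layers instead of
# freezing the backbone and only training the new head.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def train_one_epoch(loader, device="cuda"):
    model.to(device).train()
    for pages, labels in loader:        # batches of rendered page images
        optimizer.zero_grad()
        loss = criterion(model(pages.to(device)), labels.to(device))
        loss.backward()
        optimizer.step()
```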
+ Figure 4: SqueezeNet architectures taken from the paper. Left: SqueezeNet; Middle: SqueezeNet with simple bypass; Right: SqueezeNet with complex bypass. +
Figure 4: SqueezeNet architectures taken from the paper. Left: SqueezeNet; Middle: SqueezeNet with simple bypass; Right: SqueezeNet with complex bypass.
+
+

Additionally, for this particular model, low inference time is key in order to run it on hundreds of millions of documents. Inference time is also directly tied to costs, so an optimal cost/benefit ratio would require significantly higher performance to justify higher processing time. + + +### Ensembled Pages for Document Classification + +We now have a model to classify document pages, and we need to combine its page-level predictions into a prediction for the whole document, ideally together with additional meta-data such as total page count, page dimensions, etc. However, our experiments here showed that a simple ensemble of the page classifications provided an extremely strong baseline that was difficult to beat with meta-data. + +To increase efficiency, we sample 4 pages from the document to ensemble. This way we don’t run into processing issues for documents with thousands of pages. This was chosen based on the performance of the classifier and the page distribution in the document corpus, which empirically verified our assumption that this sample size reasonably represents each document. + + +### Error Analysis and Overconfidence + +After error analysis of a large sample of documents from production, we found that some classes were returning overconfident but wrong predictions. This is a very interesting challenge and one that has seen an explosion of academic research recently. To elaborate, we found documents that were predicted wrongly with over 99% confidence scores. A major consequence of this is that it negates the effectiveness of setting a threshold on model output in order to increase precision. + +While there are different ways of dealing with this, our approach involved two steps. Firstly, we utilized the “other” class mentioned earlier. By adding many of these adversarial, out-of-distribution examples to the “other” class and re-training the model, we were able to quickly improve metrics without changing model architecture. Secondly, this affected some classes more than others. For these, individual binary classifiers were built to improve precision. + +### Where do we go from here? + 
+ Figure 5: Diagram of the overall document understanding system. The red box is what we talked about in this post +
Figure 5: Diagram of the overall document understanding system. The red box is what we talked about in this post
+
+


Now that we have a model to filter documents based on visual cues, we can build dedicated information extraction models for each document type – sheet music, text-heavy, comics, tables. This is exactly how we proceed from here, and we start with extracting information from text-heavy documents. + +[Part 2](/blog/2021/information-extraction-at-scribd.html) in this series will dive deeper into the challenges and solutions our +team encountered while building these models. If you're interested to learn more about the problems Applied Research is solving or the systems which are built around those solutions, check out [our open positions!](/careers/#open-positions) + + +## References + +- [SqueezeNet: AlexNet-Level Accuracy with 50X Fewer Parameters and <0.5MB Model Size](https://arxiv.org/pdf/1602.07360.pdf) diff --git a/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md new file mode 100644 index 0000000..d476d5e --- /dev/null +++ b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md @@ -0,0 +1,46 @@ +--- +layout: post +title: "Presenting Rust and Python Support for Delta Lake" +tags: +- deltalake +- databricks +- featured +- rust +author: rtyler +team: Core Platform +--- + +Delta Lake is integral to our data platform which is why we have invested +heavily in [delta-rs](https://github.com/delta-io/delta-rs) to support our +non-JVM Delta Lake needs. This year I had the opportunity to share the progress +of delta-rs at Data and AI Summit. Delta-rs was originally started by my colleague [QP](https://github.com/houqp) just over a year ago and it has now grown into a multi-company project with numerous contributors, and downstream projects such as [kafka-delta-ingest](/blog/2021/kafka-delta-ingest.html). + + + +In the session embedded below, I introduce the delta-rs project which is +helping bring the power of Delta Lake outside of the Spark ecosystem. By +providing a foundational Delta Lake library in Rust, delta-rs can enable native +bindings in Python, Ruby, Golang, and more. We will review what functionality +delta-rs supports in its current Rust and Python APIs and the upcoming roadmap. + +I also try to give an overview of one of the first projects to use it in +production: +[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest), which +builds on delta-rs to provide a high throughput service to bring data from +Kafka into Delta Lake. + 
+ +
+ + +Investing in data platform tools and automation is a key part of the vision for +Platform Engineering which encompasses Data Engineering, Data Operations, and +Core Platform. We have a [number of open positions](/careers/#open-positions) +at the moment including a position to work closely with me as [Data Engineering +Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156). +The leader of the Data Engineering team will help deliver data tools and +solutions for internal customers building on top of Delta Lake, Databricks, +Airflow, and Kafka. Suffice it to say, there's a lot of really interesting work +to be done! diff --git a/_posts/2021-07-21-information-extraction-at-scribd.md b/_posts/2021-07-21-information-extraction-at-scribd.md new file mode 100644 index 0000000..6708e45 --- /dev/null +++ b/_posts/2021-07-21-information-extraction-at-scribd.md @@ -0,0 +1,163 @@ +--- +layout: post +title: "Information Extraction at Scribd" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +authors: +- antoniam +- rafaelp +--- + +Extracting metadata from our documents is an important part of our discovery +and recommendation pipeline, but discerning useful and relevant details +from text-heavy user-uploaded documents can be challenging. This is +part 2 in a series of blog posts describing a multi-component machine learning +system the Applied Research team built to extract metadata from our documents in order to enrich downstream discovery models. In this post, we present the challenges and +limitations the team faced when building information extraction NLP models for Scribd's +text-heavy documents and how they were solved. + +As mentioned in [part 1](/blog/2021/identifying-document-types.html), we now have a way of identifying text-heavy documents. Having done that, we want to build dedicated models to deepen our semantic understanding of them. We do this by extracting keyphrases and entities. + +
+ Figure 1: Diagram of our multi-component machine learning system. +
Figure 1: Diagram of our multi-component machine learning system.
+
+

Keyphrases are phrases that represent major themes/topics, whereas entities are proper nouns such as people, places and organizations. For example, when a user uploads a document about the Manhattan project, we will first detect it is text-heavy, then extract keyphrases and entities. Potential keyphrases would be “atomic bomb” and “nuclear weapons” and potential entities would be “Robert Oppenheimer” and “Los Alamos”. + +As keyphrase extraction brings out the general topics discussed in a document, it helps put a cap on the amount of information kept per document, resulting in a somewhat uniform representation of documents irrespective of their original size. Entity extraction, on the other hand, identifies elements in a text that aren't necessarily reflected by keyphrases only. We found the combination of keyphrase and entity extraction to provide a rich semantic description of each document. + +The rest of this post will explain how we approached keyphrase and entity extraction, how we identify whether a subset of these keyphrases and entities is present in a knowledge base (also known as linking), and how we use them to categorize documents. + +## Keyphrase Extraction + +Typically a keyphrase extraction system operates in two steps, as indicated in this survey: + +- Using heuristics to extract a list of words/phrases that serve as candidate keyphrases, such as part-of-speech language patterns, stopwords filtering, and n-grams with Wikipedia article titles + +- Determining which of these candidate keyphrases are most likely to be keyphrases, using one of two approaches: + + - Supervised approaches such as binary classification of candidates (useful/not useful), structural features based on positional encoding, etc. + + - Unsupervised approaches such as selecting terms with the highest tf-idf and clustering. + +Training a decent supervised model to be able to extract keyphrases across a wide variety of topics would require a large amount of training data, and might generalize very poorly. For this reason, we decided to take the unsupervised approach. + +Our implementation of keyphrase extraction is optimized for speed without sacrificing keyphrase quality much. We employ both a statistical method and language-specific rules to identify keyphrases efficiently. + +We simply start by filtering out stopwords and extracting the n-grams with a base n (bi-grams in our case, n=2). This step is fast and straightforward and results in an initial set of candidate n-grams. + +Limiting the results to a single n-gram class, however, results in split keyphrases, which makes linking them to a knowledge base a challenging task. For that, we attempt to agglomerate lower order n-grams into potentially longer keyphrases, as long as they occur at a predetermined minimum frequency as compared to the shorter n-gram, based on the following pattern: + +`A sequence of nouns (NN) possibly interleaved with either Coordinating Conjunctions (CC) or Prepositions and Subordinating Conjunctions (IN).` + +Here are a few examples: + +- Assuming the minimum frequency of agglomeration is 0.5, we would only replace the bi-gram `world (NN) health (NN)` by `world (NN) health (NN) organization (NN)` as long as `world health organization` occurs at least 50% as often as `world health` occurs. + +- Replace `Human (NNP) Development (NNP)` with `Center (NNP) for (IN) Global (NNP) Development (NNP)` only if the latter occurs at least a predetermined percentage of the time as compared to the former.
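
A minimal sketch of this candidate extraction and agglomeration step is shown below. It assumes NLTK (with its standard tokenizer and POS-tagger data installed); the pattern matching is simplified and the threshold is the illustrative 0.5 from the example above, so treat it as an outline rather than the production implementation.

```python
from collections import Counter
from nltk import pos_tag, word_tokenize  # assumes NLTK punkt + tagger data are installed

NOUN = {"NN", "NNS", "NNP", "NNPS"}
GLUE = {"CC", "IN"}  # conjunctions / prepositions allowed inside a longer keyphrase

def candidate_counts(text, max_len=4):
    """Count n-grams (n = 2..max_len) of nouns possibly interleaved with CC/IN."""
    tagged = [(w.lower(), t) for w, t in pos_tag(word_tokenize(text))]
    counts = Counter()
    for i in range(len(tagged)):
        for n in range(2, max_len + 1):
            gram = tagged[i:i + n]
            if len(gram) < n:
                break
            tags = [t for _, t in gram]
            if tags[0] in NOUN and tags[-1] in NOUN and all(t in NOUN | GLUE for t in tags):
                counts[tuple(w for w, _ in gram)] += 1
    return counts

def agglomerate(counts, min_ratio=0.5, max_len=4):
    """Replace each bi-gram with its longest extension that occurs at least
    min_ratio times as often as the shorter n-gram it extends."""
    keyphrases = {}
    for bigram in (g for g in counts if len(g) == 2):
        best = bigram
        for n in range(3, max_len + 1):
            threshold = min_ratio * counts[best]
            longer = [g for g in counts
                      if len(g) == n and g[:len(best)] == best and counts[g] >= threshold]
            if not longer:
                break
            best = max(longer, key=counts.get)
        keyphrases[" ".join(best)] = counts[best]
    return keyphrases
```

With `min_ratio=0.5`, `world health` is replaced by `world health organization` exactly when the tri-gram shows up at least half as often as the bi-gram, mirroring the first example above.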
+

This method results in more coherent and complete keyphrases that could be linked more accurately to a knowledge base entry. + +Finally, we use the count of occurrences of the candidate keyphrase as a proxy for its importance. This method is reliable for longer documents, as the repetition of a keyphrase tends to reliably indicate its centrality to the document’s topic. + +## Named Entities + +Keyphrases are only one side of finding what’s important in a document. To further capture what a document is about, we must also consider the named entities that are present. + +Named Entity Extraction systems identify instances of named entities in a text, which we can count in order to represent their importance in the document, similar to how we did with keyphrases. + +Naively counting named entities through exact string matches surfaces an interesting problem: a single entity may go by many names or aliases, which means string frequency is an unreliable measurement of importance. In the example given in Figure 2, we know that “Mill”, “John Stuart Mill” and “Stuart Mill” all refer to the same person. This means that Mill is even more central to the document than the table indicates, since he is referred to a total of 8 times instead of 5. + +
+ Figure 2: Excerpt from John Stuart Mill’s Wikipedia page (left) and Top 5 Named Entity counts of the first few paragraphs (right). +
Figure 2: Excerpt from John Stuart Mill’s Wikipedia page (left) and Top 5 Named Entity counts of the first few paragraphs (right).
+
+ +To address this counting problem, let's introduce a few abstractions: + +- `Named Entity` refers to a unique person, place or organization. Because of their uniqueness, we can represent them with a unique identifier (ID).  + +- `Named Entity Alias` (or simply Alias), is one of possibly many names associated with a particular entity. + +- `Canonical Alias` is the preferred name for an entity. + +- `Named Entity Mention` (or simply `Mention`), refers to each occurrence in a text that a Named Entity was referred to, regardless of which Alias was used. + +- `Knowledge Base` is a collection of entities, allowing us to query for ID, canonical name, aliases and other information that might be relevant for the task at hand. One example is [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page). + +The first step to solve the counting problem is to normalize the names a document uses to refer to a named entity. Using our abstractions, this means we want to find all the mentions in a document, and use its alias to find the named entity it belongs to. Then, replace it with either the canonical name or the named entity ID - this distinction will become clearer later on. + +### Entity Normalization + +Given a set of aliases that appear in a document, we developed heuristics (e.g. common tokens, initials) to identify which subset of aliases refer to the same named entity. This allowed us to limit our search space when comparing aliases. + +Using our previous example to illustrate this method, we start by assuming the canonical alias is the longest alias in a text for a given entity, and attempt to merge aliases together by evaluating which aliases match the heuristics we developed.  + +
+ Table 1: Top 5 occurring aliases in the first few paragraphs of John Stuart Mill’s Wikipedia page, some referring to the same person.
+ +
Table 1: Top 5 occurring aliases in the first few paragraphs of John Stuart Mill’s Wikipedia page, some referring to the same person. +
+
+ +Comparing entities with each other using exact token matching as a heuristic would solve this: + +
+ Table 2: Pairwise alias comparisons and resulting merges. Matches highlighted in bold. +
Table 2: Pairwise alias comparisons and resulting merges. Matches highlighted in bold. +
+
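
As an illustration, a stripped-down version of this token-matching merge might look like the sketch below. It only covers the exact-token heuristic; the initials handling and the ambiguity fallback described later are left out, and the example counts are illustrative.

```python
from collections import Counter

def normalize_aliases(mention_counts):
    """Merge aliases that share tokens into the longest (assumed canonical) alias.

    mention_counts: Counter mapping alias -> number of mentions, e.g.
    {"John Stuart Mill": 1, "Stuart Mill": 2, "Mill": 5, ...} (illustrative numbers).
    Returns (canonical_counts, alias -> canonical mapping).
    """
    # Assume the longest alias for an entity is its canonical form.
    aliases = sorted(mention_counts, key=len, reverse=True)
    canonical_counts = Counter()
    assigned = {}

    for alias in aliases:
        tokens = set(alias.lower().split())
        # Attach this alias to an already-seen canonical alias that contains
        # all of its tokens (the exact token-matching heuristic from Table 2).
        target = next((c for c in canonical_counts
                       if tokens <= set(c.lower().split())), None)
        if target is None:
            target = alias          # no match: this alias becomes canonical
        assigned[alias] = target
        canonical_counts[target] += mention_counts[alias]
    return canonical_counts, assigned
```

On the Figure 2 example this merges “Mill” and “Stuart Mill” into “John Stuart Mill”, recovering the 8 total mentions; an alias that token-matches more than one canonical form (the “Potter” case described below) is exactly where the Entity Linker takes over.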
+ +By replacing all mentions with its corresponding canonical alias, we are able to find the correct named entity counts. + +One edge case is when an alias might refer to more than one entity: e.g. the alias “Potter” could refer to the named entities “Harry Potter” or “James Potter” within the Harry Potter universe. To solve this, we built an Entity Linker, which determines which named entity is the most likely to match the alias given the context. This process is further explained in the Linking to a Knowledge Base section. + +When an entity is not present in a knowledge base, we cannot use Named Entity Linking to disambiguate. In this case, our solution uses a fallback method that assigns the ambiguous mention (Potter) to the closest occurring unambiguous mention that matches the heuristics (e.g. Harry).  + +## Linking to a Knowledge Base + +Given that many keyphrases and entities mentioned in a document are notable, they are likely present in a knowledge base. This allows us to leverage extra information present in the knowledge base to improve the normalization step as well as downstream tasks. + +Entity Linking assists normalization by providing information that an alias matches a named entity, which otherwise wouldn't match a heuristic (e.g. “Honest Abe” versus “Abraham Lincoln”). Furthermore, [information in a knowledge base can be used to embed linked entities and keyphrases in the same space as text](https://arxiv.org/abs/1601.01343). + +Being able to embed entities in the same space as text is useful, as this unlocks the ability to [compare possible matching named entity IDs with the context in which they’re mentioned](https://arxiv.org/abs/1911.03814), and make a decision on whether an alias we’re considering might be one of the entities in the knowledge base (in which case we will use IDs), or whether the alias doesn't match any entity in the knowledge base, in which case we fall back to using the assumed canonical alias.  + +At Scribd we make use of Entity Linking to not only improve the Entity Normalization step, but also to take advantage of entity and keyphrase embeddings as supplemental features. + +## Discussion + +Putting all of this together, we can: + +1. Link documents to keyphrases and entities + +1. Find the relative importance of each in a document + +1. Take advantage of relevant information in knowledge bases + +This has enabled some interesting projects: + +In one of them, the Applied Research team built a graph of documents along with their related keyphrases and entities. Embedding documents, keyphrases and entities in the same space allowed us to discover documents by analogy. For example, take `The Count of Monte Cristo` by Alexandre Dumas, a 19th century French novel about revenge. If we add to its embedding the embedding of `science_fiction`, it leads us to a collection of science fiction novels by Jules Verne (another 19th century French author), such as `20,000 Leagues Under the Sea` and `Journey to the Center of the Earth`. + +Keyphrase extractions have also been useful in adding clarity to document clusters. By extracting the most common keyphrases of a cluster, we can derive a common theme for the cluster’s content: + + +
+ Figure 3: Top keyphrases in a document cluster. The keywords imply that the documents therein are related to dentistry & healthcare, which was confirmed by manually inspecting the documents. +
Figure 3: Top keyphrases in a document cluster. The keywords imply that the documents therein are related to dentistry & healthcare, which was confirmed by manually inspecting the documents.
+
+ +In yet another project, the team leveraged precomputed knowledge base embeddings to represent a document in space through a composition of the entities and keyphrases it contains. These features allowed us to understand the documents uploaded by our users and improve the content discovery on the platform. + +To see how we use the information extracted to classify documents into a +taxonomy, make sure to check out [part 3](/blog/2021/categorizing-user-uploaded-documents.html). + +If you're interested to learn more about the problems Applied Research +is solving, or the systems which are built around those solutions, +check out [our open positions!](/careers/#open-positions) + diff --git a/_posts/2021-07-28-categorizing-user-uploaded-documents.md b/_posts/2021-07-28-categorizing-user-uploaded-documents.md new file mode 100644 index 0000000..ddd2349 --- /dev/null +++ b/_posts/2021-07-28-categorizing-user-uploaded-documents.md @@ -0,0 +1,140 @@ +--- +layout: post +title: "Categorizing user-uploaded documents" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +author: moniquec +--- + +Scribd offers a variety of publisher and user-uploaded content to our users and +while the publisher content is rich in metadata, user-uploaded content +typically is not. Documents uploaded by the users have varied subjects and +content types which can make it challenging to link them together. One way to +connect content can be through a taxonomy - an important type of structured +information widely used in various domains. In this series, we have already +shared how we [identify document +types](/blog/2021/identifying-document-types.html) and [extract information +from documents](/blog/2021/information-extraction-at-scribd.html), this post +will discuss how insights from data were used to help build the taxonomy and +our approach to assign categories to the user-uploaded documents. + + +## Building the taxonomy + +The unified taxonomy is a tree-structure with two layers that was designed by combining our Subject Matter Experts' (SME) knowledge of the book industry subject headings ([BISAC](https://bisg.org/page/BISACEdition) categories) and data-driven insights. We used user-reading patterns to find topics that could help enrich our unified taxonomy. + +### Data-Driven Insights + +Users have been interacting with Scribd content for more than 10 years, building reading patterns throughout time. We leveraged these reading patterns to create dense vector representations of documents similarly to word2vec in text. + +
+ Schematic representation of our approach: reading sequences are used to create vector representations for user uploaded documents. The vector dimension shown is merely illustrative. +
Figure 1: Schematic representation of our approach: reading sequences are used to create vector representations for user uploaded documents. The vector dimension shown is merely illustrative.
+
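
As a minimal sketch of this idea, one can train such reading-sequence embeddings with gensim's Word2Vec, treating each user's reading sequence as a "sentence" of document IDs; the toy data and hyperparameters below are placeholders, not the tuned production values.

```python
from gensim.models import Word2Vec

# Each "sentence" is one user's chronological reading sequence of document IDs
# (toy data here; in practice these come from the interaction logs).
reading_sequences = [
    ["doc_1", "doc_7", "doc_3"],
    ["doc_3", "doc_7", "doc_9"],
    ["doc_2", "doc_3", "doc_1"],
]

model = Word2Vec(
    sentences=reading_sequences,
    vector_size=100,   # embedding dimension (chosen via hit-ratio@20 in practice)
    window=5,          # how many co-read documents count as context
    min_count=1,       # in practice, rarely-read documents are dropped
    sg=1,              # skip-gram, as in item2vec-style setups
    workers=4,
)

doc_vector = model.wv["doc_3"]                      # dense representation of one document
similar_docs = model.wv.most_similar("doc_3", topn=2)
```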
+

For this work we focused only on user-uploaded documents and on one type of interaction (reading for a minimum amount of time). The embedding dimensions (and other hyperparameters) were chosen to optimize the hit-ratio@20 ([Caselles-Dupré, et al 2018](https://arxiv.org/abs/1804.04212)), increasing how semantically tight the embeddings are. + +Now that we have the embeddings, we would like to use them to find groups of documents with similar subjects and topics. Finding these groups will help us identify categories that should be added to the taxonomy. + +Dimensionality reduction allows for dense clusters of documents to be found more efficiently and accurately in the reduced space in comparison to the original high-dimensional space of our embeddings. We reduced the dimension of the embeddings using the [t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) algorithm. t-SNE has a non-linear approach that can capture the smaller relationships between points, as well as the global structure of the data. We used an implementation of t-SNE (“Fast Fourier Transform accelerated Interpolation-based t-SNE”, or [FIt-SNE](https://github.com/KlugerLab/FIt-SNE)) that is flexible and does not sacrifice accuracy for speed. + +Finally, we grouped the user-uploaded docs by clustering the reduced embeddings using [HDBSCAN](https://arxiv.org/pdf/1709.04545.pdf). HDBSCAN separates data points into clusters based on the density distribution. It also has a feature to detect noise: points that are too far from the nearest detected cluster to belong to it, and that lack the density to form their own cluster. + +Figure 2 shows the 2D representation of the user-uploaded documents and their groups. The first thing we noticed, highlighted in this figure, is that the major groups usually correspond to language. Not surprisingly, users tend to read content mostly in a single language. + +
+ Figure 2: Initial 2D representation of the embeddings using t-SNE and HDBSCAN. Each colored group represents a cluster found by HDBSCAN. Spread grey points were identified as noise. +
Figure 2: Initial 2D representation of the embeddings using t-SNE and HDBSCAN. Each colored group represents a cluster found by HDBSCAN. Spread grey points were identified as noise.
+
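
A minimal sketch of this reduce-then-cluster step is shown below, using scikit-learn's TSNE as a stand-in for FIt-SNE together with the hdbscan package; the random input matrix and all hyperparameters are illustrative only.

```python
import numpy as np
import hdbscan
from sklearn.manifold import TSNE

# Stand-in for the (n_docs, dim) matrix of reading-sequence embeddings
# produced in the previous step.
embeddings = np.random.rand(5000, 100)

# Non-linear reduction to 2D (FIt-SNE was used in practice; scikit-learn's
# TSNE illustrates the same idea on a smaller scale).
coords = TSNE(n_components=2, perplexity=30, init="pca", random_state=0).fit_transform(embeddings)

# Density-based clustering; points that don't belong to any dense region
# are labelled -1, i.e. noise.
labels = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=10).fit_predict(coords)

n_clusters = labels.max() + 1
n_noise = int((labels == -1).sum())
print(f"{n_clusters} clusters found, {n_noise} documents marked as noise")
```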
+

We developed a technique to further split the groups above into smaller clusters that are semantically tighter. The final clusters can be seen in Figure 3. +
+ Figure 3: Final 2D representation of the embeddings after further splitting of each cluster. Each colored group represents a subcluster found by HDBSCAN for a particular cluster. Spread grey points were identified as noise. +
Figure 3: Final 2D representation of the embeddings after further splitting of each cluster. Each colored group represents a subcluster found by HDBSCAN for a particular cluster. Spread grey points were identified as noise.
+
+

After we got the clusters and subclusters shown in Figure 3, an inspection of the English subclusters was performed in order to identify their major subjects and themes. This investigation led to the incorporation of additional categories into the taxonomy, such as Philippine law, Study aids & test prep, and Teaching methods & materials, making the taxonomy broader across different content types and browsing to this content more straightforward. + +## Placing documents into categories + +
+ Figure 4: Diagram of Scribd’s multi-component pipeline. Categorization is one of the downstream tasks highlighted in the diagram. +
+
+ +Now that we have the taxonomy, it is time to place the documents into categories. Our approach leverages the extracted key phrases and entities discussed in [part II](/blog/2021/information-extraction-at-scribd.html) of the series. Figure 5 illustrates how our model works: we trained a supervised model to place documents identified as text-heavy (see [part I](/blog/2021/identifying-document-types.html)) into categories using key phrases, entities and the text. + +
+ Figure 5: Model architecture to categorize docs. +
+
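+
+As a rough illustration of the kind of supervised setup described above (a sketch only, not our actual model architecture), one could concatenate each document's text with its extracted key phrases and entities and train a standard text classifier on the result. The field names and toy examples below are hypothetical.
+
+```python
+# Illustrative sketch: supervised categorization over text + key phrases + entities.
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import make_pipeline
+
+def to_input(doc):
+    # Combine the document text with its extracted key phrases and entities
+    return " ".join([doc["text"]] + doc["key_phrases"] + doc["entities"])
+
+train_docs = [  # hypothetical annotated examples
+    {"text": "a guide to hiking trails near Cusco", "key_phrases": ["hiking trails"], "entities": ["Cusco"], "category": "Travel"},
+    {"text": "quarterly revenue analysis for retail", "key_phrases": ["revenue growth"], "entities": ["South America"], "category": "Business"},
+]
+
+model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
+model.fit([to_input(d) for d in train_docs], [d["category"] for d in train_docs])
+```
+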
+ +### Additional insights from data + +In the first iteration of the model, we had a training dataset collected by our experts to fit the definition of each category. Not surprisingly, upon testing the model on unseen data in production, we realized that for some categories the training set was not a complete representation of the types of documents in production that could fit them, so the model was unable to generalize from the initial training set. As an example, in the initial training set most documents about countries other than the US were documents about travel. The model therefore learned that whenever a document mentions other countries, the document is most likely about travel; documents about business in South America, for instance, would be placed under travel. + +We applied a technique sometimes referred to as active learning to supplement our training set with the missing examples. Following this technique (Figure 6), the model is applied to a random sample of documents and the results are analyzed by our SMEs. + +
+ Figure 6: Active Learning Process used to improve model performance. +
+
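+
+A minimal sketch of one round of this loop is shown below; `sme_review` and `retrain` are hypothetical placeholders for the human review step and the training job, not functions from our internal tooling.
+
+```python
+# Sketch of one active-learning round from Figure 6 (helpers are hypothetical).
+import random
+
+def active_learning_round(model, unlabeled_docs, training_set, sme_review, retrain, sample_size=500):
+    # Apply the current model to a random sample of production documents
+    sample = random.sample(unlabeled_docs, sample_size)
+    predictions = [(doc, model.predict([doc])[0]) for doc in sample]
+
+    # SMEs inspect the predictions and return corrected labels for the
+    # examples the model got wrong or that expose gaps in the training set
+    corrected = sme_review(predictions)
+
+    training_set.extend(corrected)
+    return retrain(training_set)
+```
+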
+ +This iterative process had two outcomes: it improved category performance by re-training the model with a larger variety of training examples, and it led to the addition of a new category after we identified that a good fraction of documents fit that category. + +## Additional Experiments + +Throughout this project several experiments were performed to explore the full potential of the user interaction clusters. Here we will show one exciting example of such an experiment. + +#### Giving names to clusters + +As explained above, each subcluster shown in Figure 3 is generally semantically tight, which means that the documents belonging to a subcluster are usually about one (or a few) topics or subjects. + +One way to associate topics with the subclusters would be to have Subject Matter Experts manually inspect the documents in each subcluster and come up with the most important topics for each of them. However, this approach is time consuming and thus does not scale with new iterations of the model and a likely increasing number of clusters, so it is important to make this process more automatic and flexible. + +We experimented with a very promising two-step approach to automatically assign topics to subclusters. In this approach, we leverage the extracted information from the text described in [part II](/blog/2021/information-extraction-at-scribd.html) and zero-shot topic classification (more info [here](https://arxiv.org/abs/1909.00161)): + +Step 1 - Find the subclusters' most representative key phrases by clustering their documents' extracted info. + +
+ Figure 7: Illustration of Step 1. +
+
+ +Step 2 - Use the result of step 1 and zero-shot topic classification to find the highest ranking topics for each subcluster. + +
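+
+A minimal sketch of these two steps is shown below. The use of simple phrase counts in step 1 and the candidate topic list in step 2 are illustrative assumptions; in practice the extracted phrases can themselves be clustered and a much larger topic list used.
+
+```python
+# Illustrative sketch of the two-step topic naming for a subcluster.
+from collections import Counter
+from transformers import pipeline
+
+# Step 1: pick the subcluster's most representative key phrases
+# (here, simply the most frequent extracted phrases).
+def representative_phrases(docs, top_k=10):
+    counts = Counter(phrase for doc in docs for phrase in doc["key_phrases"])
+    return [phrase for phrase, _ in counts.most_common(top_k)]
+
+# Step 2: rank candidate topics with zero-shot (NLI-based) classification.
+classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+candidate_topics = ["literary criticism", "philosophy", "business", "travel", "law"]
+
+def rank_topics(docs):
+    text = ", ".join(representative_phrases(docs))
+    result = classifier(text, candidate_topics, multi_label=True)
+    return list(zip(result["labels"], result["scores"]))
+```
+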
+ Figure 8: Illustration of Step 2. The bar plot with the highest ranking topics is the result of this approach for a subcluster that contains essays about several literary works. +
+
+ +As can be seen in Figure 8, a subcluster composed of essays about literary works has “literary criticism” as its highest ranking topic, showing the potential of this approach for automatically naming user interaction clusters. + +## Conclusion + +Two important takeaways from this journey of categorizing documents were: + +**High quality labeled data** - We found that clean and consistently labeled data was much more important to the model than hyperparameter tuning. However, getting enough documents that fit the categories in our diverse corpus was a challenge. Several techniques were used to improve model performance on unseen data. Among them, active learning proved to be an important way to collect additional training samples and to guarantee the required granularity in the training set. + +**Annotation alignment** - High quality data and model performance are both connected to the annotation process (see more [here](https://www.youtube.com/watch?v=06-AZXmwHjo)). When multiple annotators are involved in data collection and evaluation, alignment on the definition of each category is crucial for accurate training and evaluation of the model. This is even more essential in text classification, since associating categories or topics with a text can be a very subjective task, especially when we are dealing with a single-label categorization problem. + +This project was an important milestone in understanding our user-uploaded documents: classifying documents has enabled users to browse documents by category from our unified taxonomy. Additionally, we can now understand the categories that each user is interested in and interacts with. Combining user interests with business metrics could help drive innovative and unexpected product decisions as well as enrich discoverability and recommendations. + +## Next Steps + +**Improve the taxonomy using a data-driven approach:** + +Moving forward, how can we make sure that newly uploaded documents are covered in our taxonomy? + +Using a data-driven approach to build the taxonomy answers this question and guarantees more flexibility, comprehensiveness, and specificity than a manually created taxonomy. As new content is uploaded to our platform and read by users, new user interaction clusters will form and help us identify recent user interests. For instance, during the pandemic, users started uploading documents related to Covid-19. Clustering the documents in 2021, for example, yields an additional cluster related to Covid-19, one that did not exist prior to the pandemic. This approach will help us build a less rigid taxonomy, a taxonomy that reflects Scribd’s vast content and is easily expandable in the long run. + +**Multi-language:** + +Now that we better understand our user-uploaded content in English and have a consistent pipeline to label these documents, we can extend this approach to other languages. + +This work and post were done in collaboration with my colleague [Antonia Mouawad](https://ca.linkedin.com/in/antoniamouawad) on the Applied Research team. If you're interested in learning more about the problems Applied Research is solving, or the systems which are built around those solutions, check out [our open positions](/careers/#open-positions).
diff --git a/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md new file mode 100644 index 0000000..5f17cad --- /dev/null +++ b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md @@ -0,0 +1,92 @@ +--- +layout: post +title: "Armadillo makes audio players in Android easy" +tags: +- android +- kotlin +- armadillo +- featured +author: nathans +team: Android +--- + +Armadillo is the fully featured audio player library Scribd uses to play and +download all of its audiobooks and podcasts, which is [now open +source](https://github.com/scribd/armadillo). It specializes in playing HLS +or MP3 content that is broken down into chapters or tracks. It leverages +[Google’s Exoplayer](https://github.com/google/ExoPlayer/) library for its audio engine. Exoplayer wraps a variety of +low-level audio and video APIs but has few opinions of its own for actually +using audio in an Android app. + +![Armadillo Image](https://raw.githubusercontent.com/scribd/armadillo/main/armadillo.webp) + +The leap from Exoplayer to a full audio player +is enormous, both in terms of the amount of code needed and the amount of +domain knowledge required about complex audio-related subjects. Armadillo +provides a turn-key solution for powering an audio player and providing the +information to update a UI. + +- **Easy-to-use** because it outputs state updates with everything needed for a UI or analytics. Works in the background state. +- **Effective** because it uses Google’s Exoplayer as the playback engine. +- **Ready-to-go** out of the box usage for a developer looking to use an audio player. +- **Robust** because it contains numerous configuration options for supporting almost any requirement and includes a number of other Android APIs +required for a high-quality audio player. + +## What does it include? + +- Support for HLS and MP3 audio +- Exoplayer for downloading and playback +- [MediaBrowserService](https://developer.android.com/reference/android/service/media/MediaBrowserService) so the app can be played in the background, browsed by other apps, and integrated with Android Auto. +- [MediaSession](https://developer.android.com/reference/android/media/session/MediaSession) to support commands from media controllers, e.g. a Bluetooth headset. + +## Getting Started: + +The library is hosted with GitHub Packages, so you will need to add the GitHub registry with authentication to your `build.gradle` file. See the official docs on authenticating [here](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-gradle-registry#authenticating-to-github-packages). In short, you will need to: + +1. Generate a [personal access token](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) from your GitHub account. +1. Add the GitHub package registry with authentication to your `build.gradle` file. + +```kotlin +maven { + name = "GitHubPackages" + url = uri("https://maven.pkg.github.com/scribd/armadillo-and") + credentials { + username = "github_username" + password = "github_access_token" + } +} +``` + +It is as easy as adding this code snippet to your Activity / Fragment to play your first piece of content.
+ +```kotlin +// construct your media +val media = AudioPlayable( + id = 0, + title = "Google Hosted Mp3", + request = AudioPlayable.MediaRequest.createHttpUri("https://storage.googleapis.com/exoplayer-test-media-0/play.mp3"), + chapters = emptyList() +) + +// initialize the player +val armadilloPlayer = ArmadilloPlayerFactory.init() + +// begin playback +armadilloPlayer.beginPlayback(media) + +// listen for state updates +armadilloPlayer.armadilloStateObservable.subscribe { + + // update your UI here + +} +``` + +That’s all you need to get started! + +## Next Steps: + +For a more complex example, please see the [TestApp](https://github.com/scribd/armadillo/tree/main/TestApp) included in the library. If +you have any problems, don’t be afraid to open up an issue [on +GitHub](https://github.com/scribd/armadillo). + diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md new file mode 100644 index 0000000..8916901 --- /dev/null +++ b/_posts/2022-04-28-data-ai-summit-2022.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Scribd is presenting at Data and AI Summit 2022" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We are very excited to be presenting and attending this year's [Data and AI +Summit](https://databricks.com/dataaisummit/north-america-2022) which will be +hosted virtually and physically in San Francisco from June 27th-30th. +Throughout the course of 2021 we completed a number of really interesting +projects built around [delta-rs](https://github.com/delta-io/delta-rs) and the +Databricks platform which we are thrilled to share with a broader audience. +In addition to the presentations listed below, a number of Scribd engineers who +are responsible for data and ML platform, machine learning systems, and more, +will be in attendance if you want to meet up and learn more about how Scribd +uses data and ML to change the way the world reads! + + +* [Christian Williams](https://github.com/xianwill) will be sharing some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +**[Streaming Data into Delta Lake with Rust and +Kafka](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1834)** +* [QP Hou](https://github.com/houqp), Scribd Emeritus, will be presenting on +his foundational work to ensure correctness within delta-rs during his session: +**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1623)** +* [R Tyler Croy](https://github.com/rtyler) will be co-presenting with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with **[Doubling the size of the data lake without doubling the +cost](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=2366)** + + +There are so many great sessions to watch in person or online during the event, +particularly around [Delta Lake](https://delta.io), which is one of our +favorite technologies and powers our entire data platform. We are also +expecting some great ML related talks as data and ML begin to overlap more and +more. We hope to see you there! 
+ diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md new file mode 100644 index 0000000..1007238 --- /dev/null +++ b/_posts/2022-06-28-databricks-serverless.md @@ -0,0 +1,58 @@ +--- +layout: post +title: "Accelerating Looker with Databricks SQL Serverless" +tags: +- looker +- databricks +- featured +team: Core Platform +author: hamiltonh +--- + +We recently migrated Looker to Databricks SQL Serverless, reducing our +infrastructure cost and the footprint of infrastructure we need to +worry about! “Databricks SQL” provides a single load-balanced Warehouse +for executing Spark SQL queries across multiple Spark clusters behind the +scenes. “Serverless” is an evolution of that concept: rather than running a SQL +Warehouse in our AWS infrastructure, the entirety of execution happens on the +Databricks side. With a much simpler and faster interface, queries executed in +Looker now return results to our users much faster than ever before! + +When we originally provisioned our “Databricks SQL” warehouses, we worked +together with our colleagues at Databricks to ensure [the terraform provider +for Databricks](https://github.com/databricks/terraform-provider-databricks) was +ready for production usage; as of today it is Generally Available. That +original foundation in Terraform allowed us to more easily adopt SQL Serverless +once it was made available to us. + +```hcl +resource "databricks_sql_warehouse" "warehouse" { + name = "Looker Serverless" + # ... + enable_serverless_compute = true + # ... +} +``` + +The feature was literally brand new so there were a few integration hurdles we +had to work through with our colleagues at Databricks, but we got things up and +running in short order. By adopting SQL Serverless, we could avoid setting up +special networking, IAM roles, and other resources within our own AWS account; +instead, we can rely on pre-provisioned compute resources within Databricks' own +infrastructure. No more headache of ensuring all of the required infra is in +place and set up correctly! + +The switch to Serverless reduced our infra configuration and management +footprint, which by itself is an improvement. We also noticed a significant +reduction in cold start times for the SQL Serverless Warehouse compared to the +standard SQL Warehouse. The faster start-up times meant we could configure even +lower auto-terminate times on the warehouse, saving us even more on +unproductive and idle cluster costs. + +On the Looker side there really wasn’t any difference in the connection +configuration other than a URL change. In the end, after some preparation work, +a simple 5-minute change in Looker and a 5-minute change in Terraform +switched everything over to Databricks SQL Serverless, and we were ready to +rock! Our BI team is very happy with the performance, especially on cold start +queries. Our CFO is happy about reducing infrastructure costs. And I’m happy +about simpler infrastructure! diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md new file mode 100644 index 0000000..828f149 --- /dev/null +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Data and AI Summit Wrap-up" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We brought a whole team to San Francisco to present and attend this year's Data and +AI Summit, and it was a blast!
+I +would consider the event a success, both in the attendance at the Scribd-hosted +talks and in the number of talks that discussed patterns we have adopted in our +own data and ML platform. +The three talks I [wrote about +previously](/blog/2022/data-ai-summit-2022.html) were well received and have +since been posted to YouTube along with _hundreds_ of other talks. + +* [Christian Williams](https://github.com/xianwill) shared some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +[![Streaming Data into Delta Lake with Rust and Kafka](https://img.youtube.com/vi/do4jsxeKfd4/hqdefault.jpg)](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) +* [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on +his foundational work to ensure correctness within delta-rs during his session: +[![Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://img.youtube.com/vi/ABoCnrVWCKY/hqdefault.jpg)](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) +* [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with: +[![Doubling the size of the data lake without doubling the cost](https://img.youtube.com/vi/9QDRD0PzqCE/hqdefault.jpg)](https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) + +Members of the Scribd team participated in a panel to discuss the past, +present, and future of Delta Lake on the expo floor. We also took advantage of +the time to have multiple discussions with our colleagues at Databricks about +their product and engineering roadmap, and where we can work together to +improve the future of Delta Lake, Unity Catalog, and more. + +For those working in the data, ML, or infrastructure space, there are a lot of +_great_ talks available online from the event, which I highly recommend +checking out. Data and AI Summit is a great event for leaders in the industry +to get together, so we'll definitely be back next year! diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md new file mode 100644 index 0000000..37f22c2 --- /dev/null +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -0,0 +1,133 @@ +--- +layout: post +title: "The Evolution of the Machine Learning Platform" +team: Machine Learning Platform +author: bshaw +tags: +- mlops +- featured +- ml-platform-series +--- + +Machine Learning Platforms (ML Platforms) have the potential to be a key component in achieving production ML at scale without large technical debt, yet ML Platforms are not often understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms, in an effort to increase understanding of these platforms and how they can best be applied. + + +Technical Debt and Development Velocity Defined +----------------------------------------------- + +### Development Velocity + +Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment in a production environment.
It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing through model training, evaluation, validation, deployment, and testing for new models, or re-training, validation, and deployment of existing models. + +### Technical Debt + +The term "technical debt" in software engineering was coined by Ward Cunningham. Cunningham used the metaphor of financial debt to describe the trade-off between implementing a quick and dirty solution to meet immediate needs (similar to taking on financial debt for short-term gain) versus taking the time to do it properly with a more sustainable and maintainable solution (akin to avoiding financial debt but requiring more upfront investment). Just as financial debt accumulates interest over time, technical debt can accumulate and make future development more difficult and expensive. + +The idea behind technical debt is to highlight the consequences of prioritizing short-term gains over long-term maintainability and the need to address and pay off this "debt" through proper refactoring and improvements. The term has since become widely adopted in the software development community to describe the accrued cost of deferred work on a software project. + +### Technical Debt in Machine Learning + +Originally a software engineering concept, technical debt is also relevant to machine learning systems; in fact, the landmark Google paper suggests that ML systems have a propensity to accumulate technical debt easily. + +> Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. Using the software engineering framework of technical debt, we find it is common to incur massive ongoing maintenance costs in real-world ML systems. +> +> [Sculley et al (2015) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +> As the machine learning (ML) community continues to accumulate years of experience with live systems, a widespread and uncomfortable trend has emerged: developing and deploying ML systems is relatively fast and cheap, but maintaining them over time is difficult and expensive. +> +> [Sculley et al (2015) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +Technical debt is especially important to consider when trying to move fast. Moving fast is easy; moving fast without acquiring technical debt is a lot more complicated. + +The Evolution Of ML Platforms +----------------------------- + +### DevOps -- The paradigm shift that led the way + +DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. DevOps has become standard practice in modern software development, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: +1. **Automation** + +2. **Continuous Testing** + +3. **Continuous Monitoring** + +4. **Collaboration and Communication** + +5. **Version Control** + +6. 
**Feedback Loops** + + +### Platforms -- Reducing Cognitive Load + +This shift to DevOps and teams owning the entire development lifecycle introduces a new challenge: additional cognitive load. Cognitive load can be defined as + +> The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +The weight of the additional load introduced when teams own the entire software development lifecycle can hinder productivity, prompting organizations to seek solutions. + +Platforms emerged as a strategic solution, carefully abstracting away unnecessary details of the development lifecycle. This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow. + +> The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +> Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users +> +> [Rowse & Shepherd (2022) Building Infrastructure Platforms](https://martinfowler.com/articles/building-infrastructure-platform.html) + +### MLOps -- Reducing technical debt of machine learning + +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps. MLOps is a methodology that takes inspiration from and incorporates the best practices of DevOps, tailoring them to address the distinctive challenges inherent in machine learning. It applies the established principles of DevOps to machine learning, recognizing that only a small fraction of a real-world ML system is the actual ML code, and it serves as a crucial bridge between development and the ongoing work of maintaining ML systems. +MLOps provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability across the ML lifecycle. Correctly applied, MLOps can play a pivotal role in controlling technical debt and ensuring the efficiency, reliability, and scalability of the machine learning lifecycle over time. + +Scribd's ML Platform -- MLOps and Platforms in Action +------------------------------------- +At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers. This platform has been built with MLOps in mind, which can be seen through its use of common DevOps principles. + +1. **Automation:** + * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS-based endpoints. + * Automating model training through the use of Airflow DAGs and allowing these DAGs to trigger the deployment pipelines to deploy a model once re-training has occurred. + +2. **Continuous Testing:** + * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing. 
+ * Increased tooling to support model validation testing. + +3. **Monitoring:** + * Monitoring real-time inference endpoints + * Monitoring training DAGs + * Monitoring batch jobs + +4. **Collaboration and Communication:** + * A Feature Store, which provides feature discovery and re-use + * A Model Database, which provides model collaboration + +5. **Version Control:** + * Applying version control to experiments, machine learning models, and features + + +References +---------- + +[Bottcher (2018, March 05). What I Talk About When I Talk About Platforms. https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) + +[D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-François Crespo, Dan Dennison (2015). Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +[Fowler (2022, October 20). Conway's Law. https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) + +[Galante. What is Platform Engineering. https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) + +[Humanitec. State of Platform Engineering Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) + +[Hodgson (2023, July 19). How Platform Teams Get Stuff Done. https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) + +[Murray (2017, April 27). The Art of Platform Thinking. https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) + +[Rouse (2017, March 20). Technical Debt. https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) + +[Rowse & Shepherd (2022). Building Infrastructure Platforms. https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) + +[Skelton & Pais (2019). Team Topologies](https://teamtopologies.com/book) diff --git a/_posts/2025-01-15-cloud-native-data-ingestion.md b/_posts/2025-01-15-cloud-native-data-ingestion.md new file mode 100644 index 0000000..2df1b88 --- /dev/null +++ b/_posts/2025-01-15-cloud-native-data-ingestion.md @@ -0,0 +1,35 @@ +--- +layout: post +title: "Cloud-native Data Ingestion with AWS Aurora and Delta Lake" +team: "Core Infrastructure" +author: rtyler +tags: +- deltalake +- rust +- featured +--- + + +One of the major themes for Infrastructure Engineering over the past couple of +years has been higher reliability and better operational efficiency. In a +recent session with the [Delta Lake](https://delta.io) project I was able to +share the work led by Kuntal Basu and a number of other people to _dramatically_ +improve the efficiency and reliability of our online data ingestion pipeline. + + +> Join Kuntal Basu, Staff Infrastructure Engineer, and R. Tyler Croy, Principal +> Engineer at Scribd, Inc. as they take you behind the scenes of Scribd’s data
> ingestion setup. They’ll break down the architecture, explain the tools, and +> walk you through how they turned off-the-shelf solutions into a robust +> pipeline. + + +## Video + +
+ + +## Presentation + +
+ + diff --git a/_posts/2025-03-14-terraform-oxbow-module.md b/_posts/2025-03-14-terraform-oxbow-module.md new file mode 100644 index 0000000..ab48af2 --- /dev/null +++ b/_posts/2025-03-14-terraform-oxbow-module.md @@ -0,0 +1,61 @@ +--- +layout: post +title: "Terraform module to manage Oxbow Lambda and its components" +tags: +- Oxbow +- Terraform +- AWS +- deltalake +- rust +team: Core Infrastructure +author: Oleh Motrunych +--- + + +[Oxbow](https://github.com/buoyant-data/oxbow) is a project that turns an existing storage location containing [Apache Parquet](https://parquet.apache.org/) files into a [Delta Lake table](https://delta.io/). +It is intended to run either as an AWS Lambda or as a command line application. +We are excited to introduce [terraform-oxbow](https://github.com/scribd/terraform-oxbow), an open-source Terraform module that simplifies the deployment and management of the Oxbow Lambda and its supporting components. Whether you're working with AWS Glue, Kinesis Data Firehose, SQS, or DynamoDB, this module provides a streamlined approach to infrastructure as code (IaC) in AWS. + +### ✨ Why terraform-oxbow? +Managing event-driven architectures in AWS can be complex, requiring careful orchestration of multiple services. Terraform-oxbow abstracts much of this complexity, enabling users to configure key components with simple boolean flags and module parameters. This is an easy and efficient way to have a Delta table created from Apache Parquet files. +### 🚀 Features + +With **terraform-oxbow**, you can deploy: + +- AWS Oxbow Lambda with customizable configurations +- Kinesis Data Firehose for real-time data streaming +- SQS and SQS Dead Letter Queues for event-driven messaging +- IAM policies for secure access management +- S3 bucket notifications to trigger Lambda functions +- DynamoDB tables for data storage and locking +- AWS Glue Catalog and Tables for schema management + + +### ⚙️ How It Works + +This module follows a modular approach, allowing users to enable or disable services based on their specific use case. Here are a few examples: + +- To enable AWS Glue Catalog and Tables: + + ```hcl + enable_aws_glue_catalog_table = true + ``` + +- To enable the Kinesis Data Firehose delivery stream: + + ```hcl + enable_kinesis_firehose_delivery_stream = true + ``` + +- To enable S3 bucket notifications: + + ```hcl + enable_bucket_notification = true + ``` + +- To enable the advanced Oxbow Lambda setup for multi-table filtered optimization: + + ```hcl + enable_group_events = true + ``` + +- AWS S3 bucket notifications have limitations: Due to AWS constraints, an S3 bucket can only have a single notification configuration per account. If you need to trigger multiple Lambda functions from the same S3 bucket, consider using event-driven solutions like SNS or SQS. + + +- IAM Policy Management: The module provides the necessary permissions but follows the principle of least privilege. Ensure your IAM policies align with your security requirements. + + +- Scalability and Optimization: The module allows fine-grained control over Lambda concurrency, event filtering, and data processing configurations to optimize costs and performance. + diff --git a/assets/js/jobs.js b/assets/js/jobs.js index 07d3b18..8172cb9 100644 --- a/assets/js/jobs.js +++ b/assets/js/jobs.js @@ -5,7 +5,7 @@ * * With that disclaimer out of the way... 
* - * This file handles the fetching of jobs from Lever such that they can be + * This file handles the fetching of jobs from Lever^WAshby such that they can be * dynamically inserted into different parts of the tech blog */ @@ -13,7 +13,7 @@ * This API will return an list of departments which must then be filtered * through to find the .postings under each */ -const API_URL = 'https://api.lever.co/v0/postings/scribd?group=department&mode=json' +const API_URL = 'https://api.ashbyhq.com/posting-api/job-board/scribd?includeCompensation=true' /* @@ -37,21 +37,20 @@ function fetchJobs() { return fetch(API_URL) .then(async (response) => { - const departments = await response.json(); + const board = await response.json(); /* * Since this is the tech blog, we're only pulling a couple of * departments */ - departments - .filter(d => ['Engineering', 'Data Science', 'Design', 'Business Analytics', 'Product'].includes(d.title)) - .forEach((department) => { - department.postings.forEach((posting) => { - const team = posting.categories.team; + board.jobs + .filter(j => ['Engineering', 'Product, Design, & Analytics', 'Product'].includes(j.department)) + .filter(j => !j.title.toLowerCase().includes('marketing')) + .forEach((job) => { + const team = job.team; if (!window.jobsCache[team]) { window.jobsCache[team] = []; } - window.jobsCache[team].push(posting); - }); + window.jobsCache[team].push(job); }); window.jobsFetched = true; return window.jobsCache; @@ -98,9 +97,9 @@ function renderJobs(elem, team, randomLimit) { li.innerHTML = `
- ${job.text} + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

`; elem.appendChild(li); diff --git a/careers.html b/careers.html index 1965edf..5a5072a 100644 --- a/careers.html +++ b/careers.html @@ -18,7 +18,7 @@ alt="two people sitting around a sofa reading on a computer and tablet">
-

Help us change the way the world reads.

+

Help us build the largest and most accessible library connecting storytellers with their audience.

Our readers are on a mission to become their best selves, and so are we. We’re not afraid to take risks because we know that — win or lose — we’ll learn from them.

If you’re a talented team player and want to work somewhere where your input matters, we’d love to talk with you.

@@ -159,14 +159,14 @@

${team} li.innerHTML = `
- - ${job.text} + + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

- `; diff --git a/tag/mlops/index.md b/tag/mlops/index.md new file mode 100644 index 0000000..b51bead --- /dev/null +++ b/tag/mlops/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: mlops" +tag: mlops +robots: noindex +---