diff --git a/.gitignore b/.gitignore index 045c10b..90a4b88 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ _site .jekyll-metadata vendor *.sw* +.idea/ diff --git a/Gemfile b/Gemfile index fd51732..f0f42ae 100644 --- a/Gemfile +++ b/Gemfile @@ -1,18 +1,21 @@ source "https://rubygems.org/" -gem "jekyll", "~> 3.8.5" +#gem "jekyll", "~> 4.0" gem 'kramdown' gem 'rouge' # If you want to use GitHub Pages, remove the "gem "jekyll"" above and # uncomment the line below. To upgrade, run `bundle update github-pages`. -gem "github-pages", "~> 201", group: :jekyll_plugins +gem "github-pages", group: :jekyll_plugins # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.11" + gem "jekyll-feed" + gem 'jekyll-paginate' end +gem 'webrick' + # Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem # and associated library. install_if -> { RUBY_PLATFORM =~ %r!mingw|mswin|java! } do diff --git a/Gemfile.lock b/Gemfile.lock index a243b05..9859e75 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,226 +1,252 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.11.1) - i18n (~> 0.7) + activesupport (6.0.6.1) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.7.0) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.8.0) public_suffix (>= 2.0.2, < 5.0) coffee-script (2.4.1) coffee-script-source execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.5) - dnsruby (1.61.3) - addressable (~> 2.5) - em-websocket (0.5.1) + commonmarker (0.23.10) + concurrent-ruby (1.2.0) + dnsruby (1.61.9) + simpleidn (~> 0.1) + em-websocket (0.5.3) eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) + http_parser.rb (~> 0) + ethon (0.15.0) + ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.7.0) - faraday (0.17.0) + execjs (2.8.1) + faraday (1.10.0) + faraday-em_http (~> 1.0) + faraday-em_synchrony (~> 1.0) + faraday-excon (~> 1.1) + faraday-httpclient (~> 1.0) + faraday-multipart (~> 1.0) + faraday-net_http (~> 1.0) + faraday-net_http_persistent (~> 1.0) + faraday-patron (~> 1.0) + faraday-rack (~> 1.0) + faraday-retry (~> 1.0) + ruby2_keywords (>= 0.0.4) + faraday-em_http (1.0.0) + faraday-em_synchrony (1.0.0) + faraday-excon (1.1.0) + faraday-httpclient (1.0.1) + faraday-multipart (1.0.3) multipart-post (>= 1.2, < 3) - ffi (1.11.1) + faraday-net_http (1.0.1) + faraday-net_http_persistent (1.2.0) + faraday-patron (1.0.0) + faraday-rack (1.0.0) + faraday-retry (1.0.3) + ffi (1.15.5) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (201) - activesupport (= 4.2.11.1) - github-pages-health-check (= 1.16.1) - jekyll (= 3.8.5) - jekyll-avatar (= 0.6.0) + github-pages (226) + github-pages-health-check (= 1.17.9) + jekyll (= 3.9.2) + jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) + jekyll-commonmark-ghpages (= 0.2.0) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.11.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.12.1) - jekyll-mentions (= 1.4.1) - jekyll-optional-front-matter (= 0.3.0) + jekyll-github-metadata (= 2.13.0) + jekyll-include-cache (= 0.2.1) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.2.0) - jekyll-redirect-from (= 0.14.0) - jekyll-relative-links (= 0.6.0) - jekyll-remote-theme (= 
0.4.0) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.3) jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.5.0) - jekyll-sitemap (= 1.2.0) - jekyll-swiss (= 0.4.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) - jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.3) - jekyll-theme-slate (= 0.1.1) - jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.1) - jemoji (= 0.10.2) - kramdown (= 1.17.0) - liquid (= 4.0.0) - listen (= 3.1.5) + jekyll-seo-tag (= 2.8.0) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) + jekyll-theme-architect (= 0.2.0) + jekyll-theme-cayman (= 0.2.0) + jekyll-theme-dinky (= 0.2.0) + jekyll-theme-hacker (= 0.2.0) + jekyll-theme-leap-day (= 0.2.0) + jekyll-theme-merlot (= 0.2.0) + jekyll-theme-midnight (= 0.2.0) + jekyll-theme-minimal (= 0.2.0) + jekyll-theme-modernist (= 0.2.0) + jekyll-theme-primer (= 0.6.0) + jekyll-theme-slate (= 0.2.0) + jekyll-theme-tactile (= 0.2.0) + jekyll-theme-time-machine (= 0.2.0) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.2) + kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.5.0) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.11.0) + minima (= 2.5.1) + nokogiri (>= 1.13.4, < 2.0) + rouge (= 3.26.0) terminal-table (~> 1.4) - github-pages-health-check (1.16.1) + github-pages-health-check (1.17.9) addressable (~> 2.3) dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 3.0) + public_suffix (>= 3.0, < 5.0) typhoeus (~> 1.3) - html-pipeline (2.12.0) + html-pipeline (2.14.1) activesupport (>= 2) nokogiri (>= 1.4) - http_parser.rb (0.6.0) + http_parser.rb (0.8.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.8.5) + jekyll (3.9.2) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.6.0) - jekyll (~> 3.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) + jekyll-commonmark (1.4.0) + commonmarker (~> 0.22) + jekyll-commonmark-ghpages (0.2.0) + commonmarker (~> 0.23.4) + jekyll (~> 3.9.0) + jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.11.0) - jekyll (~> 3.3) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.12.1) - jekyll (~> 3.4) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.4.1) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) - jekyll-optional-front-matter (0.3.0) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-readme-index (0.2.0) - jekyll (~> 3.0) - jekyll-redirect-from (0.14.0) - 
jekyll (~> 3.3) - jekyll-relative-links (0.6.0) - jekyll (~> 3.3) - jekyll-remote-theme (0.4.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.3) addressable (~> 2.0) - jekyll (~> 3.5) - rubyzip (>= 1.2.1, < 3.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) + jekyll-seo-tag (2.8.0) + jekyll (>= 3.8, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) + jekyll-theme-architect (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) + jekyll-theme-cayman (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 3.5) + jekyll-theme-dinky (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) + jekyll-theme-leap-day (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) + jekyll-theme-merlot (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) + jekyll-theme-midnight (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) + jekyll-theme-minimal (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) + jekyll-theme-modernist (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.3) - jekyll (~> 3.5) + jekyll-theme-primer (0.6.0) + jekyll (> 3.5, < 5.0) jekyll-github-metadata (~> 2.9) jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) + jekyll-theme-slate (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) + jekyll-theme-tactile (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) + jekyll-theme-time-machine (0.2.0) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.1) - jekyll (~> 3.3) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.10.2) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (~> 3.0) - kramdown (1.17.0) - liquid (4.0.0) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.2) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.1) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.4.0) - minima (2.5.0) - jekyll (~> 3.5) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.12.2) + minitest (5.17.0) multipart-post (2.1.1) - nokogiri (1.10.8) - mini_portile2 (~> 2.4.0) - octokit (4.14.0) + nokogiri (1.16.5-x86_64-linux) + racc (~> 1.4) + octokit (4.22.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (3.1.1) - rb-fsevent (0.10.3) - rb-inotify 
(0.10.0) + public_suffix (4.0.7) + racc (1.7.3) + rb-fsevent (0.11.1) + rb-inotify (0.10.1) ffi (~> 1.0) - rouge (3.11.0) - ruby-enum (0.7.2) - i18n - ruby_dep (1.5.0) - rubyzip (2.0.0) + rexml (3.3.9) + rouge (3.26.0) + ruby2_keywords (0.0.5) + rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) @@ -230,30 +256,38 @@ GEM sawyer (0.8.2) addressable (>= 2.3.5) faraday (> 0.8, < 2.0) + simpleidn (0.2.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.3.1) + typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.5) + tzinfo (1.2.10) thread_safe (~> 0.1) - tzinfo-data (1.2019.3) + tzinfo-data (1.2022.1) tzinfo (>= 1.0.0) - unicode-display_width (1.6.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.8.1) + unicode-display_width (1.8.0) wdm (0.1.1) + webrick (1.8.2) + zeitwerk (2.6.6) PLATFORMS - ruby + x86_64-linux DEPENDENCIES - github-pages (~> 201) - jekyll (~> 3.8.5) - jekyll-feed (~> 0.11) + github-pages + jekyll-feed + jekyll-paginate kramdown rouge tzinfo (~> 1.2) tzinfo-data wdm (~> 0.1.1) + webrick BUNDLED WITH - 2.0.2 + 2.3.8 diff --git a/README.md b/README.md index 571bd2d..5797b07 100644 --- a/README.md +++ b/README.md @@ -15,3 +15,24 @@ The types of content published should generally be technology oriented, but discussions about organization culture, collaboration, and process are welcome so long as they pass the bar of: "would this be interesting to somebody we would want to work with us?" + +# Local build + +``` +# If you don't have Ruby 2.6 installed +# jekyll-sass-converter requires Ruby version >= 2.4.0 +# Default MacOS ruby version is 2.3 +# Ruby < 2.6 are not maintained anymore +# Ruby 2.7 prints bunch of warnings for Jekyll < 3.8.7 +# Using Jekyll 3.8.7 requires bumping github-pages and jekyll-feed +brew install ruby@2.6 +echo 'export PATH="/usr/local/opt/ruby@2.6/bin:$PATH"' >> "$HOME/.bash_profile" +source "$HOME/.bash_profile" + +sudo gem install bundler # if you don't have bundler installed +bundle config set path vendor +bundle install + +bundle exec jekyll serve --livereload # for auto-updading +open http://localhost:4000 +``` diff --git a/_category/data-science.md b/_category/data-science.md index aa400d2..073e6db 100644 --- a/_category/data-science.md +++ b/_category/data-science.md @@ -1,4 +1,4 @@ --- -team: Data Science +team: Applied Research permalink: "/blog/category/data-science" --- diff --git a/_category/internal-tools.md b/_category/internal-tools.md new file mode 100644 index 0000000..449f6fa --- /dev/null +++ b/_category/internal-tools.md @@ -0,0 +1,4 @@ +--- +team: Internal Tools +permalink: "/blog/category/internal-tools" +--- diff --git a/_category/recommendations.md b/_category/recommendations.md new file mode 100644 index 0000000..8e928c4 --- /dev/null +++ b/_category/recommendations.md @@ -0,0 +1,4 @@ +--- +team: Recommendations +permalink: "/blog/category/recommendations" +--- diff --git a/_config.yml b/_config.yml index 6c2231a..3e577cc 100644 --- a/_config.yml +++ b/_config.yml @@ -7,7 +7,7 @@ description: >- # this means to ignore newlines until "baseurl:" baseurl: "" # the subpath of your site, e.g. /blog url: "https://tech.scribd.com/" # the base hostname & protocol for your site, e.g. 
http://example.com google_analytics: 'UA-443684-30' -featured_series: 'airflow-series' +featured_series: 'kyc-series' # GitHub Metadata # Used for "improve this page" link diff --git a/_data/authors.yml b/_data/authors.yml index 4ff976d..69d8120 100644 --- a/_data/authors.yml +++ b/_data/authors.yml @@ -3,6 +3,13 @@ # description, etc --- +bshaw: + name: Ben Shaw + github: benshaw + twitter: ben_a_shaw + about: | + Ben leads the ML Platform group, helping scale production Machine Learning at scribd. Other times you will find him outside playing in the mountains. + alexjb: name: Alex Bernardin github: alexofmanytrades @@ -102,7 +109,71 @@ maksymd: name: Maksym Dovhal github: Maks-D -Kuntalb: +kuntalb: name: Kuntal Kumar Basu github: kuntalkumarbasu +alexk: + name: Alex Kushnir + github: shtusha + +nakulpathak3: + name: Nakul Pathak + github: nakulpathak3 + twitter: nakulpathak3 + blog: https://nakulpathak3.github.io + about: | + Nakul works on Sidekiq, Terraform-ing AWS Fargate, Ruby and Rails upgrades, and monolith migrations at Scribd. + +ajhofmann: + name: Adam Hofmann + github: ajhofmann + +trupin: + name: Theo Rupin + github: trupin + +div: + name: Div Dasani + github: divdasani + about: | + Div is a Machine Learning Engineer on the Recommendations team, working on personalized + recommendations and online faceted search. + +gregr: + name: Greg Reznik + github: imfromthebay + + +jonathanr: + name: Jonathan Ramkissoon + twitter: _JRamkissoon + github: jramkiss + blog: https://jramkiss.github.io/ + about: | + Jonathan is a data scientist on the Applied Research team building machine learning models to understand and connect our content. + +antoniam: + name: Antonia Mouawad + github: AntoniaMouawad + about: | + Antonia is a data scientist on the Applied Research team building machine learning models to understand and connect our content. + +nathans: + name: Nathan Sass + github: NathanSass + about: | + Nathan is a software engineer on the Android platform team. + +rafaelp: + name: Rafael Lacerda + github: lacerda + blog: https://blog.lacerda.ch/ + about: | + Rafael is a data scientist on the Applied Research team building machine learning models to understand and connect our content. + +moniquec: + name: Monique Alves Cruz + github: MAlvesCruz + about: | + Monique is a data scientist on the Applied Research team building machine learning models to understand and connect our content. diff --git a/_data/benefits.yml b/_data/benefits.yml index 01c8b08..6ab2250 100644 --- a/_data/benefits.yml +++ b/_data/benefits.yml @@ -10,7 +10,7 @@ - name: Paid Parental Leave description: "100% pay for the first 6 weeks of leave for the birth, adoption, or foster placement of a child." -- name: Visa Sonsorship +- name: Visa Sponsorship description: "Looking to move to the U.S.? We’ll sponsor your visa to help you get settled." - name: PTO + Holiday Week diff --git a/_data/team-structure.yml b/_data/team-structure.yml index 006f0e4..90f26e4 100644 --- a/_data/team-structure.yml +++ b/_data/team-structure.yml @@ -26,9 +26,9 @@ about titles in our library by analyzing content and user behavior and building predictive models. -- team: Data Science +- team: Applied Research description: | - The Data Science team drives decisions by creating insights into the product + The Applied Research team drives decisions by creating insights into the product and improve the user experience with machine learning. 
- team: Core Platform @@ -55,10 +55,11 @@ - team: Core Infrastructure description: | - The Infrastructure team’s mission is to provide a high quality low-level - infrastructure shared between all engineering efforts. Rather than focusing - on supporting any individual application – including scribd.git – this team - focuses on infrastructure used by all, or nearly all projects at Scribd. + The Infrastructure team's mission is to provide secure and reliable cloud + infrastructure shared between all engineering efforts with a focus on efficient + automation and self-service. Our vision is an integrated set of standardized + solutions that empower service ownership by facilitating and promoting DevOps + practices within Scribd. - team: Security Engineering description: | @@ -74,7 +75,7 @@ - team: iOS description: | The iOS team's mission is to deliver a performant, stable and feature-rich - Android application. + iOS application. - team: Web Development description: | @@ -90,3 +91,8 @@ description: | The Web QA team strives for a defect-free Scribd website known for its reliability. + +- team: Service Foundations + description: | + The Service Foundations team provides reliable, high-quality, scalable service foundations + that teams can leverage to easily build, deploy and monitor self-owned, distributed services.
diff --git a/_data/teams.yml b/_data/teams.yml index 58cd1e9..56a4665 100644 --- a/_data/teams.yml +++ b/_data/teams.yml @@ -3,22 +3,22 @@ # plumbing jobs into team's blog posts --- iOS: - # the category in Lever - lever: 'iOS' + lever: 'Mobile' Android: - lever: 'Android' + lever: 'Mobile' + +Applied Research: + lever: 'Data Science' Data Science: - lever: 'Data Science - San Francisco' + lever: 'Data Science' Core Platform: lever: 'Core Platform' Data Engineering: - # No clue why these jobs are grouped with Core Platform in Lever, but not - # really important to fix at the moment - lever: 'Core Platform' + lever: 'Data Engineering' Core Infrastructure: lever: 'Core Infrastructure' @@ -44,3 +44,19 @@ Web Development: Security Engineering: lever: 'Security Engineering' + +Internal Tools: + lever: 'Internal Tools' + +Recommendations: + lever: 'Recommendations' + about: | + The Recommendations team at Scribd wants to inspire users to read more and discover + new content and topics. Our team comprises Machine Learning and Software Engineers, + Product Managers, Data Scientists, and QA and Project Managers, all of whom have the + shared passion of building the world's best recommendation engine for books. We pride + ourselves on using a variety of open-source technologies to develop and productionize + state-of-the-art machine learning solutions. + +IT: + lever: 'IT' \ No newline at end of file
diff --git a/_includes/related-jobs.html b/_includes/related-jobs.html index d350d91..e6ec8f4 100644 --- a/_includes/related-jobs.html +++ b/_includes/related-jobs.html @@ -13,6 +13,8 @@
Related Jobs diff --git a/_includes/team-color-logic.html b/_includes/team-color-logic.html index 76dac8b..e87eb10 100644 --- a/_includes/team-color-logic.html +++ b/_includes/team-color-logic.html @@ -9,7 +9,7 @@ {% elsif page.team == "Android" %} {% assign theme = 'grass' %} - {% elsif page.team == "Data Science" %} + {% elsif page.team == "Applied Research" %} {% assign theme = 'slate' %} {% elsif page.team == "Web Development" %} diff --git a/_layouts/home.html b/_layouts/home.html index b92aa95..3439c1f 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -21,7 +21,7 @@

Help us build our next project.

-

We're on a mission to change the way the world reads. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

+

We're on a mission to build the largest and most accessible library connecting storytellers with their audience. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.

All Positions diff --git a/_posts/2018-01-05-neural-spelling-corrections.md b/_posts/2018-01-05-neural-spelling-corrections.md index 46205af..9bbbc99 100644 --- a/_posts/2018-01-05-neural-spelling-corrections.md +++ b/_posts/2018-01-05-neural-spelling-corrections.md @@ -5,7 +5,7 @@ author: mattr tags: - seq2seq - data -team: Data Science +team: Applied Research --- Introduction diff --git a/_posts/2018-02-12-search-query-parsing.md b/_posts/2018-02-12-search-query-parsing.md index c3937e9..49e4498 100644 --- a/_posts/2018-02-12-search-query-parsing.md +++ b/_posts/2018-02-12-search-query-parsing.md @@ -5,7 +5,7 @@ author: mattr tags: - search - data -team: Data Science +team: Applied Research --- Scribd has a variety of content to offer and connecting our users with their desired content is a crucial aspect of our product. One of the main ways that users find content on Scribd is through search, and in this post I want to delve into an analysis we did regarding parsing out valuable information from a user’s query in order to better serve them relevant results, and also learn more about what they are searching for. diff --git a/_posts/2018-03-20-scribds-ab-testing.md b/_posts/2018-03-20-scribds-ab-testing.md index 2b3e3ad..59a21d0 100644 --- a/_posts/2018-03-20-scribds-ab-testing.md +++ b/_posts/2018-03-20-scribds-ab-testing.md @@ -5,7 +5,7 @@ author: dfeldman tags: - testing - data -team: Data Science +team: Applied Research --- What is A/B testing? diff --git a/_posts/2018-04-18-bandits-for-the-win.md b/_posts/2018-04-18-bandits-for-the-win.md index d0a0734..a70db49 100644 --- a/_posts/2018-04-18-bandits-for-the-win.md +++ b/_posts/2018-04-18-bandits-for-the-win.md @@ -5,7 +5,7 @@ author: dfeldman tags: - testing - data -team: Data Science +team: Applied Research --- We love A/B testing at Scribd. What follows is a specific example to give you an inside look at the process from idea to implementation for an algorithm test. diff --git a/_posts/2018-05-31-non-random-seo-test.md b/_posts/2018-05-31-non-random-seo-test.md index b115427..262d008 100644 --- a/_posts/2018-05-31-non-random-seo-test.md +++ b/_posts/2018-05-31-non-random-seo-test.md @@ -6,7 +6,7 @@ tags: - seo - testing - data -team: Data Science +team: Applied Research --- Months ago, your friends convinced you to sign up for a half marathon. With three weeks to go, you haven’t even started training. In a growing panic, you turn to the internet for answers. diff --git a/_posts/2019-02-07-calculating-customer-lifetime-revenue.md b/_posts/2019-02-07-calculating-customer-lifetime-revenue.md index cbb708a..d4c679e 100644 --- a/_posts/2019-02-07-calculating-customer-lifetime-revenue.md +++ b/_posts/2019-02-07-calculating-customer-lifetime-revenue.md @@ -5,7 +5,7 @@ author: bclearly tags: - ltr - data -team: Data Science +team: Applied Research --- Why LTR? (Lifetime Revenue) diff --git a/_posts/2019-03-04-experiments-with-seq2seq.md b/_posts/2019-03-04-experiments-with-seq2seq.md index 8f3beac..ff10bb4 100644 --- a/_posts/2019-03-04-experiments-with-seq2seq.md +++ b/_posts/2019-03-04-experiments-with-seq2seq.md @@ -6,7 +6,7 @@ tags: - machinelearning - seq2seq - data -team: Data Science +team: Applied Research --- How much data do you need to train a seq2seq model? Let’s say that you want to translate sentences from one language to another. You probably need a bigger dataset to translate longer sentences than if you wanted to translate shorter ones. How does the need for data grow as the sentence length increases? 
diff --git a/_posts/2020-04-29-monitoring-aws-with-panther.md b/_posts/2020-04-29-monitoring-aws-with-panther.md index 6866b4d..da5ceec 100644 --- a/_posts/2020-04-29-monitoring-aws-with-panther.md +++ b/_posts/2020-04-29-monitoring-aws-with-panther.md @@ -5,10 +5,13 @@ tags: - monitoring - aws - featured +- archived team: Security Engineering author: paha --- +***NOTE***: *Scribd’s security infrastructure has since evolved away from using Panther* + Before widespread cloud usage, it was uncommon for one person to be present for the entire datacenter development lifecycle. Very few people knew how to design and build a datacenter from scratch while ensuring appropriate security configuration settings were set, on top of rigging up monitoring. It was even more uncommon for non-sysadmins to have any involvement in data center infrastructure construction or ongoing refinement. The cloud is very different. It only takes seconds to create an entire infrastructure from a template. And even developers are doing it! The monitoring challenges for such a scenario are significant. There aren't necessarily "more" monitoring data points, but the speed with which infrastructure can be created tends to result in infrastructure getting way out over its skis with respect to monitoring. Furthermore, since many barriers to entry for doing stupid things have been lowered to the point of non-existence, monitoring is the last great hope of maintaining control over a cloud environment. While access controls can still provide some guardrails, the flexibility that all engineers need to do their jobs requires that they have the ability to do "dangerous" things that they've never had to do before. The true definition of "full stack" has expanded. diff --git a/_posts/2020-12-21-sidekiq-incident-learnings.md b/_posts/2020-12-21-sidekiq-incident-learnings.md new file mode 100644 index 0000000..b09f71c --- /dev/null +++ b/_posts/2020-12-21-sidekiq-incident-learnings.md @@ -0,0 +1,82 @@ +--- +layout: post +title: "Learning from incidents: getting Sidekiq ready to serve a billion jobs" +author: nakulpathak3 +tags: +- incident response +- sidekiq +- monitoring +- featured +team: Internal Tools +--- + +Scribd currently serves hundreds of Sidekiq jobs per second and has served 25 billion jobs since its adoption 2 years ago. Getting to this scale wasn’t easy. In this post, I’ll walk you through one of our first ever [Sidekiq](https://sidekiq.org/) incidents and how we improved our Sidekiq implementation as a result of this incident. + +### The Incident + +A large number of jobs for importing podcasts into Scribd were enqueued via Sidekiq. They took many hours to run and since they were added to our “default” queue, all our servers picked them up unlike if they were in the “bulk” queue. These jobs quickly starved all other jobs including the highest priority ones. + +**Detection:** The incident was detected by an internal user noticing the queue build-up in Sidekiq’s web UI and a corresponding customer complaint that we linked back to this issue. Our systems were negatively affected for around **7 hours** and the incident was noticed at the 6 hour mark. + +**Resolution:** We ran a script on production to delete all existing jobs of this problematic worker from Sidekiq’s Redis instance and removed the batch job that was enqueuing them. We let the currently running jobs finish since killing them would require ssh-ing and running risky sudo commands on production servers. 
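For reference, the cleanup itself boils down to a short script against Sidekiq's Redis-backed API. Below is a minimal sketch of that kind of script, assuming a hypothetical `PodcastImportWorker` class name:

```ruby
# Minimal sketch only: the worker class name here is hypothetical; substitute
# whichever worker is flooding the queue.
require 'sidekiq/api'

# Drop every not-yet-started job for the offending worker from the queue.
Sidekiq::Queue.new('default').each do |job|
  job.delete if job.klass == 'PodcastImportWorker'
end

# Jobs that already failed and are waiting to retry can be cleared the same way.
Sidekiq::RetrySet.new.each do |job|
  job.delete if job.klass == 'PodcastImportWorker'
end
```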
+ +### What we learned + +As pretty much our first ever major Sidekiq incident, we wrote an in-depth incident review that focused on 4 problem areas - + +#### Quicker Detection + +Our mean-time-to-detect this incident was way too high. To address this, we needed metrics and alerting. Since we have a Sidekiq Enterprise license, we simply integrated the [Pro](https://github.com/mperham/sidekiq/wiki/Pro-Metrics#enabling-metrics) and [Enterprise metrics](https://github.com/mperham/sidekiq/wiki/Ent-Historical-Metrics) into our existing Ruby Dogstatsd client. + +We added the following Datadog monitors - + +* X queue latency > Y value over past Z minutes +* % of job failures / total jobs > X% over last Y minutes + +#### Quicker Debugging + +To help add some debugging power to the monitors above, we also created some useful dashboards. +Overall Sidekiq dashboard +Overall worker metrics + +We added Sidekiq system-level, queue-level, and worker-level graphs that allow us to quickly go from system health to queue health to erroneous worker. From there, we can go over to the worker dashboard to find out whether the issue is around processing time or job failures and debug further in Sentry if needed. + +Dashboard for each worker + +Later, as Scribd adopted Datadog further, we added [APM for Sidekiq](https://docs.datadoghq.com/tracing/setup_overview/setup/ruby/#sidekiq) which covered a lot of the functionality we had but also added tracing of worker performance to further debug issues. + +#### Quicker Resolution + +Now that we’re able to quickly identify incidents and debug them, the next step is to resolve the issue. + +Something we learned from the incident was that editing Sidekiq Redis while it is already overloaded is a slow and highly error-prone process. To overcome this, we utilized Sidekiq’s ability to [inject custom middlewares](https://github.com/mperham/sidekiq/wiki/Middleware). + +**Job Dropping Middleware:** We created a client middleware that would check a worker’s name against a live feature flag sidekiq_dropped_workers to decide if that worker should execute or be dropped pre-execution. This allowed us to “drain” a specific worker without having to manually edit Sidekiq Redis. + +Flow diagram for dropping Sidekiq worker jobs + +**Job Disabling Middleware:** In some cases, the worker’s issues may be easily resolvable in an upcoming deploy or re-enqueuing the workers may be extremely difficult. To address such a case, we introduced sidekiq_disabled_workers feature flag which utilized Sidekiq’s [ScheduledSet](https://github.com/mperham/sidekiq/wiki/Scheduled-Jobs) to return those jobs to Redis to be run 24 hours later. + +Flow diagram for temporarily disabling Sidekiq worker jobs + +**Job Termination Incidents Page:** Finally, it was essential to find a way to quickly terminate existing problematic workers that have overtaken the queue. Sidekiq’s web UI is also [quite extensible](https://github.com/mperham/sidekiq/issues/3335) so we added a new web page called the “Incidents” tab which allows us to pause queues and terminate existing processes. + +Sidekiq incidents tab in web UI + +#### Future prevention + +The team that added the problematic worker was not aware of Sidekiq’s shared model of usage and their worker’s ability to affect the system. They didn’t know when they should be using the default queue or the bulk queue. + +**Documentation:** We created processing time and worker importance expectations for each queue. 
We listed best practices such as using timeouts, preferring multiple smaller jobs, and idempotency, and we linked to the [Sidekiq documentation](https://github.com/mperham/sidekiq/wiki/Best-Practices) where we felt people may want more information. + +**Runbook:** We also created an Incident Handling runbook that walks people through finding a problematic worker, debugging, and resolving the incident. + +Sidekiq runbook for incident handling + +**Guardrails:** We also seriously considered adding timeouts which would forcefully terminate workers that go significantly over their queue’s expected processing time. However, we settled for raising a Sentry exception for workers that miss our guidelines, auto-assigned to the team that owns the worker (via the CODEOWNERS file). This approach has been sufficient for us so far.
+ +### Where we are now + +Our systems are far from perfect, but Sidekiq issues are now recognized within 5-10 minutes of their occurrence and usually resolved with no significant production impact. + +When we addressed these incidents, we were running on data center servers, but since then we’ve moved our workloads to AWS Fargate tasks. We’d like to add queue-based auto-scaling, along with the ability to recognize and automatically resolve database performance degradation caused by Sidekiq workers.
diff --git a/_posts/2021-03-02-github-actions-datadog-reporting.md b/_posts/2021-03-02-github-actions-datadog-reporting.md new file mode 100644 index 0000000..50fd811 --- /dev/null +++ b/_posts/2021-03-02-github-actions-datadog-reporting.md @@ -0,0 +1,55 @@ +--- +layout: post +title: "Unifying developer velocity metrics in Datadog with GitHub Actions" +author: ajhofmann +tags: +- monitoring +- datadog +- featured +team: Internal Tools +--- + +At Scribd we have a wide variety of projects and repositories that our developers work on every day. The Internal Tools team is dedicated to creating tooling and automation that empowers developers to deliver code as swiftly as possible. A standardized and unified method to report metrics around developer velocity and CI/CD is therefore key to being able to identify areas for improvement and measure success in improving developer workflows.
+ +### GitHub Actions + +[GitHub Actions](https://github.com/features/actions) offers a CI/CD solution to build, test and deploy code directly in GitHub. One of the key features of GitHub Actions is the ability to create an open source action that can be easily used by any other GitHub Actions workflow in a few lines. The actions currently on the market range from installing languages like [Ruby](https://github.com/ruby/setup-ruby) to [posting messages to Slack](https://github.com/abinoda/slack-action), along with all sorts of other [awesome things](https://github.com/sdras/awesome-actions). Some actions provide the ability to report [custom Datadog metrics](https://github.com/marketplace/actions/datadog-action) from a workflow; however, there weren't any actions that automatically collected, formatted and reported development or developer velocity metrics to Datadog.
+ +### Datadog Reporting in GitHub Actions + +Without a solution on the [GitHub Actions marketplace](https://github.com/marketplace?type=actions) to accomplish what we wanted, the Internal Tools team created a GitHub Action that could be used across all of Scribd’s projects and teams to report metrics that give us a view of how fast we are able to deliver from the organization level all the way down to specific projects.
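The reporting piece is meant to run as one extra job appended to the end of an existing workflow. Below is a rough sketch of what that wiring could look like; the workflow and job names and the version tag are placeholders, and the action's configuration inputs are omitted (its README documents the real interface):

```yaml
# Illustrative sketch only: names, version tag, and inputs are placeholders.
name: CI
on: [push]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - run: bundle exec rspec

  report-to-datadog:
    runs-on: ubuntu-latest
    needs: [test]     # run after the jobs we want to measure
    if: always()      # report durations even when earlier jobs fail
    steps:
      - uses: scribd/github-action-datadog-reporting@v1   # version tag is illustrative
```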
+ +With our now published [open source GitHub Action](https://github.com/scribd/github-action-datadog-reporting) we provide the ability for a quick, lightweight job to be added to the end of any GitHub workflow that reports the duration of every job and the entire duration of the workflow directly to [Datadog](https://www.datadoghq.com/). The action can also be integrated into a standalone workflow that calculates and reports how long pull requests take to open, how many lines are changed, and how long the pull request takes to move from open to merge. + +Additionally, all of the metrics are automatically tagged by the Action with information such as whether the durations are from passed or failed jobs, as well as the repository, workflow and job that the durations correspond to. This information allows us to create fully customizable Datadog dashboards that can focus on the velocity of the organization, a single team, a project and even a single job in the workflow.
+ +### Putting the Data to Use + +Going forward, these unified metrics across the projects will enable the Internal Tools team to identify potential areas of slowdown for developers at Scribd, and measure the success of our efforts to enable developers to ship clean and correct code as efficiently as possible. + +When all metrics are reported to Datadog using the same prefix, we can leverage tags and templates to easily make dashboards of any scope, from tracking the organization's velocity all the way down to that of a single project or workflow. When not filtering anything, we can see the developer velocity events across all installed projects: + +Three graphs showing the data for time to merge and open pull requests, and lines changed per pull request + + +The time to merge metrics support tags for team and repository, so we can easily add filters for any single tag or combination of tags. + +The Tools team's time to merge graph + +One of the key features of the action is tracking job and workflow timing. + +A graph showing significant increase in workflow duration + +A graph showing a stable or decreasing workflow duration + +The above graphs represent workflow runtime data collected from two different projects. By filtering the reports using the project and workflow tags, we can watch the workflows for any trends that might be slowing down the workflow and track when the issue started and how significant it is. In the above example it looks like the “First Workflow” is having some performance issues, so let’s break the job duration metric down by the jobs in the workflow. + +Four graphs showing job duration, with one showing a significant recent increase + +Looking at the job breakdown makes it very clear where our issue is: Job 3 is causing some performance issues in the workflow starting from Friday morning, and will need to be fixed. Note that the above graphs have had their workflow and job names obscured for the purposes of the blog.
+ +### Into the Future + +With the GitHub Action published on the public marketplace, Scribd will continue to integrate the action across its projects for increased monitoring and issue tracking. The code is now [open sourced and available on GitHub](https://github.com/scribd/github-action-datadog-reporting) and contributions are welcome. + + If you would like to join the Internal Tools team or Scribd on our journey then take a look at our careers page.
diff --git a/_posts/2021-03-11-introducing-sql-delta-import.md b/_posts/2021-03-11-introducing-sql-delta-import.md new file mode 100644 index 0000000..ec708f9 --- /dev/null +++ b/_posts/2021-03-11-introducing-sql-delta-import.md @@ -0,0 +1,161 @@ +--- +layout: post +title: "Importing MySQL Data into Delta Lake" +author: alexk +tags: +- databricks +- spark +- deltalake +- featured +team: Data Engineering +--- + +OLTP databases are a common data source for Data Lake based warehouses which use Big Data tools to run +batch analytics pipelines. The classic Apache Hadoop toolchain includes +[Apache Sqoop](https://sqoop.apache.org/) - a tool for bulk import/export +of data between HDFS and relational data stores. Our pipelines were using this tool as well, primarily +to import MySQL data into HDFS. When the Platform Engineering team took on the migration of +our on-premise Hadoop workloads to the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) +on AWS, we had to write our own tool to import data from MySQL directly into S3-backed [Delta Lake](https://delta.io/). +In this post I will share the details about `sql-delta-import` - an open source utility we have proposed for inclusion in the +[Delta Lake +Connectors](https://github.com/delta-io/connectors/pull/80) project, and we're +looking forward to working with others to improve and accelerate importing data +into Delta Lake!
+ +### Sample import + +Importing data into a Delta Lake table is as easy as + +```sh +spark-submit \ +--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \ +--jdbc-url jdbc:mysql://hostName:port/database \ +--source source.table \ +--destination destination.table \ +--split-by id +```
+ +### This looks a lot like `sqoop`... why didn't you just use that? + +We considered using `sqoop` at first but quickly dismissed that option for multiple reasons: + +#### 1. Databricks Lakehouse Platform does not come with `sqoop` +Yes, we could have run our sqoop jobs on EMR clusters, but we wanted to run everything in Databricks and +avoid an additional technology footprint and overhead. But even if we drop that restriction... + +#### 2. `sqoop` does not support writing data directly to Delta Lake +`sqoop` can only import data as text or parquet. Writing to Delta directly allows us to +optimize data storage for best performance on reads by just adding a couple of configuration options + +```sh +spark-submit \ +--conf spark.databricks.delta.optimizeWrite.enabled=true \ +--conf spark.databricks.delta.autoCompact.enabled=true \ +--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \ +--jdbc-url jdbc:mysql://hostName:port/database \ +--source source.table \ +--destination destination.table \ +--split-by id +```
+ +#### 3. `--num-mappers` is just not good enough to control parallelism when working with a database +`sqoop` uses map-reduce under the hood. We can specify the `--num-mappers` parameter, which controls how many +mappers will be used to import data. A small number of mappers can result in a large volume +of data per import and long-running transactions. A large number of mappers will result in many connections +to the database, potentially overloading it, especially when there are a lot of `sqoop` jobs running in parallel. +Additionally, since there are no reduce stages in `sqoop` jobs, a large number of mappers will result in a large +number of output files, potentially introducing a small-files problem.
+ +`sql-delta-import` uses the `--chunks` parameter to control the number of... well... chunks to split the source table +into, and standard Spark parameters like `--num-executors` and `--executor-cores` to control data import +concurrency, thus allowing you to tune those parameters independently + +```sh +spark-submit --num-executors 15 --executor-cores 4 \ +--conf spark.databricks.delta.optimizeWrite.enabled=true \ +--conf spark.databricks.delta.autoCompact.enabled=true \ +--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \ +--jdbc-url jdbc:mysql://hostName:port/database \ +--source source.table \ +--destination destination.table \ +--split-by id \ +--chunks 500 +```
+ +In the example above the source table will be split into 500 chunks, resulting in quick transactions and promptly released connections, +but no more than 60 concurrent connections will be used for the import since the max degree of parallelism is 60 (15 executors x 4 cores). +The `delta.optimizeWrite` and `delta.autoCompact` configuration will yield optimal file size output for the destination table + +#### 3.1 `--num-mappers` and data skew just don't play nicely together + +When `sqoop` imports data, the source table is split into ranges based on the `--split-by` column and each mapper +imports its corresponding range. This works well when the `--split-by` column has a near-uniform distribution +of data, but that's not always the case with source tables... As tables age we tend to add additional columns to them to +take on new business requirements, so over time the data in the latest rows has a higher fill rate than in earlier rows. + +![row density increase over time](/post-images/2021-03-sql-delta-import/row_density_increase.png) + +Our source tables here at Scribd definitely have these characteristics. We also have some tables that have entire +ranges of data missing due to data cleanup. At some point large chunks of data were just deleted from these tables. + +![missing rows](/post-images/2021-03-sql-delta-import/missing_rows.png)
+ +This type of data skew will result in processing time skew and output file size skew when you can only control the number of +mappers. Yes, we could introduce an additional computed synthetic column in the source table as our `split-by` column, but now +there is an additional column that does not add business value, app developers need to be aware of it, computing and +storing it takes up database resources, and if we plan to use it for imports it had better be indexed, consuming even more +compute and storage resources. + +With `sql-delta-import` we still split source tables into ranges based on the `--split-by` column, but if there is data +distribution skew we can "solve" this problem by making the number of chunks much larger than the max degree of parallelism. +This way large chunks with high data density are broken up into smaller pieces that a single executor can handle. +Executors that get chunks with little or no data can just quickly process them and move on to do some real work. + + +### Advanced use cases + +For advanced use cases you don't have to use the provided Spark application directly. `sql-delta-import` +libraries can be imported into your own project.
You can specify custom data transformations or a custom JDBC dialect to gain +more precise control over data type handling + +```scala +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import io.delta.connectors.spark.JDBC._ + +implicit val spark: SparkSession = SparkSession.builder().master("local").getOrCreate() + + +// All additional possible jdbc connector properties described here - https://dev.mysql.com/doc/connector-j/8.0/en/connector-j-reference-configuration-properties.html +val jdbcUrl = "jdbc:mysql://hostName:port/database" + +val config = ImportConfig(source = "table", destination = "target_database.table", splitBy = "id", chunks = 10) + +// a sample transform to convert all timestamp columns to strings +val timeStampsToStrings : DataFrame => DataFrame = source => { + val tsCols = source.schema.fields.filter(_.dataType == DataTypes.TimestampType).map(_.name) + tsCols.foldLeft(source)((df, colName) => + df.withColumn(colName, from_unixtime(unix_timestamp(col(colName)), "yyyy-MM-dd HH:mm:ss.S"))) +} + +// Whatever functions are passed to below transform will be applied during import +val transforms = new DataTransform(Seq( + df => df.withColumn("id", col("id").cast(types.StringType)), //custom function to cast id column to string + timeStampsToStrings //included transform function converts all Timestamp columns to their string representation +)) + +val importer = new JDBCImport(jdbcUrl = jdbcUrl, importConfig = config, dataTransform = transforms) + +importer.run() +```
+ +Prior to migrating to the Databricks Lakehouse Platform we had roughly 300 `sqoop` jobs. We were able to +successfully port all of them to `sql-delta-import`. Today they happily coexist in production with other Spark +jobs, allowing us to use a uniform set of tools for orchestrating, scheduling, monitoring and logging for all of our jobs. + +If you're interested in working with Delta Lake, the Databricks platform, or +enabling really interesting machine learning use-cases, check out our [careers +page](/careers/#open-positions)!
diff --git a/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md b/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md new file mode 100644 index 0000000..0f42eb1 --- /dev/null +++ b/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md @@ -0,0 +1,82 @@ +--- +layout: post +title: "Introducing Lucid: A Swift Library For Building Robust Data Flows" +author: trupin +tags: +- swift +- architecture +- codegen +- featured +team: iOS +--- + +Lucid is a Swift library which provides a series of convenient tools for building robust data layers for applications. + +We built it with three main ideas in mind: + +- **Declarative**: Lucid makes it easy to declare complex data models and provides the tools to use them with plain Swift code. +- **Modularity**: Use the technologies which suit your data flow the best. Lucid gives you the infrastructure to seamlessly integrate the stack you want to use. +- **Adaptability**: Built to fit most kinds of standard and non-standard server APIs, Lucid abstracts away server-side structural decisions by providing a universal client-side API. + +Today we're happy to open source Lucid so that developers around the world can use it in their own applications. + +### Why Lucid? + +At Scribd, the iOS application has always been a huge part of our business.
As we kept adding new features to it, the codebase became more and more complex to maintain. One of the biggest hurdles we encountered was an inconsistency in how data was handled throughout the app. + +We decided to tackle this issue by providing a series of tools that would help us handle all of our data flows throughout the app with a single uniform API. This is how Lucid was born. + +Our entire iOS codebase has now been migrated to using Lucid for things like fetching data from the servers, storing data to disk, resolving read/write conflicts for data stored locally and remotely, listening to local data changes, etc…
+ +### What features does Lucid provide? + +The first thing to think about when working with Lucid is the data models. Lucid lets you define your data models in the form of JSON files. Those files are then interpreted by Lucid's command line tool, which generates all of the boilerplate code you'll need to handle those models in your application. + +Once imported into your project, here's what the generated code, coupled with Lucid's framework, can help you with: + +- Read data from your servers, local storage, or both, with a query system supported by a full-featured DSL expressed in Swift. +- Write data to your servers, or local storage, or both. +- Listen to local data changes based on queries. +- Standard store implementations using CoreData, in-memory caching, and disk caching. +- Easily create and use your own stores. +- Adaptable to most backend APIs, even those serving data in the form of a tree or graph. +- Automatic model relationship(s) fetching. +- External plugin support for code generation.
+ +### The Design + +![Architecture Diagram](https://docs.google.com/drawings/d/e/2PACX-1vQ1BkNqPJO6dlox3AyQAN2MD066GLZVr7B7MCCldmI1Et-Xnlqzzr5Yxw0_OS5VaDAW3O6jCPILvlj_/pub?w=960&h=720) + +Lucid lets you use two main types of objects: + +- **Entity objects**, which are automatically generated from their respective JSON description files. They represent your data. +- **Manager objects**, which provide a uniform API to read/write data to multiple locations. + +Internally, each manager interacts with as many stores as needed. There are two types of stores: + +- **Remote stores**. They represent the servers and directly talk to them via HTTP. +- **Local stores**. They represent a local source of data such as a key/value cache, a Core Data database, etc... + +In short, managers are in charge of synchronizing the data between the stores. Stores are in charge of bridging the data to specific technologies. + +There is much more to discover about Lucid in the [documentation](https://github.com/scribd/Lucid/tree/master/Documentation/Manual).
+ +### Who is Lucid for? + +Lucid is for developers who don't want to reinvent the wheel every time they need to read/write data in their application. With Lucid, you are able to declare your data models once, then choose whichever built-in functionality you need to build a robust data flow. + +Lucid was designed to let you focus on data modeling rather than implementation details. For example, if you decide you want to store your data to disk, you just need to add a single line to your object's JSON description. + +### Where can I find Lucid? + +Lucid is available on [Github](https://github.com/scribd/Lucid) under the MIT license. + +If you like Lucid, you might like other open source projects we developed at Scribd which you can find on our [Github page](https://github.com/scribd). + +### Can I contribute?
+ +You are more than welcome to contribute to Lucid. You can open a PR or file issues on [Github](https://github.com/scribd/Lucid). Please refer to our [contributions guidelines](https://github.com/scribd/Lucid/blob/master/CONTRIBUTING.md) before doing so. + + +If you're interested in building great mobile applications with us check out our +[careers page](/careers/#open-positions)! diff --git a/_posts/2021-03-18-faster-fargate-deploys.md b/_posts/2021-03-18-faster-fargate-deploys.md new file mode 100644 index 0000000..a6e8b2f --- /dev/null +++ b/_posts/2021-03-18-faster-fargate-deploys.md @@ -0,0 +1,60 @@ +--- +layout: post +title: "Speeding up ECS Fargate deployments" +author: nakulpathak3 +tags: +- aws +- deploys +- featured +team: Internal Tools +--- + +Scribd moved its monolith to AWS in April 2020 and as part of the migration, we had to design and implement a deployment pipeline for our new (and *shiny*) [ECS Fargate](https://aws.amazon.com/fargate/) infrastructure. In this post, we'll share how we improved our deployment speeds from ~40 minutes to less than 20 minutes. + +### Original Implementation + +Our starting implementation involved a few steps: +- Deploying assets via [Capistrano](https://capistranorb.com/) to our asset-hosting servers *(2.5 minutes)* +- Executing a Fargate task to run any database migrations *(3 minutes)* +- Restarting and waiting on ~500 Fargate tasks via the AWS CLI *(32-35 minutes)* + +### Improvements + +#### Fargate Service Updates +By far, the slowest part of our deployment was waiting for ECS services to finish updating. We use the default rolling deployment which stops and starts tasks to trigger a re-pulling of the freshly-uploaded [ECR](https://aws.amazon.com/ecr/) image. Here are some changes we implemented - + +* **Docker Image Size Reduction** - The first thing everyone thinks of when considering ECS Fargate speedups is how to reduce the image pull time since Fargate (unlike EC2) [has no image caching](https://github.com/aws/containers-roadmap/issues/696). However, unless you can drastically reduce your image size (think 1Gb to 100Mb), this will not lead to significant time reductions. We reduced our compressed image size from ~900Mb to ~700Mb and it led to **little to no improvement**. It did lead to a cleaner image but that wasn't our initial goal. + +* [**Deregistration Delay**](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-target-groups.html#deregistration-delay) - This is a property on a load balancer's target group that dictates how long a task stays in *Draining* state after it stops receiving requests. We looked at Datadog APM for the p99 of our longest running request and set the delay to 17s from the **default of 300s**. This reduced service refreshes to ~22 minutes. + +* **ECS Throttling** - During deployments, we investigated the "Events" tab of our main web ECS service. There were events with the following messages - + - *"service production-web operations are being throttled on elb. Will try again later."* + - *"service production-web operations are being throttled. Will try again later."* + + Due to Scribd's high Fargate task volume, the number of start and stop requests we were making was too high for AWS' default limits. We opened support tickets with the ELB and Fargate teams to get those limits increased. This further reduced service deploy time to 16-18 minutes. 
+ +* **Network Load Balancer Health Checks** - From testing in staging, we noticed that reducing our network load balancer's [health-check intervals and thresholds](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/target-group-health-checks.html) helped reduce staging deploy time from ~9 to ~6 minutes. However, it only translated to 1-2 minutes saved in production, with its much higher number of ECS tasks. You do want to be careful with the value to avoid false-positive health checks, and keep in mind that updating these values requires re-creation of the ECS service the load balancer points to.
+ +#### Asset Deployment Improvements +Our asset deployments were run using Capistrano. The job `ssh`-ed onto our asset servers and ran a series of [Rake tasks](https://guides.rubyonrails.org/v4.2/command_line.html#rake) to download, unzip, and correctly place assets. There were some issues with this approach - +* The dependency on the Capistrano gem forced us to use the monolith Docker image as the job's base image +* Running Rake tasks required loading the application, which added time to the job +* Our ECS service refresh job runs `docker push/pull` tasks to upload the latest image to ECR. This forced us to have separate jobs for asset and service deployments to avoid adding a Docker dependency to the monolith image. + +To resolve these issues, we decided to remove Capistrano & Rake as dependencies and wrote pure Ruby and Bash code to perform the tasks. This unified the two jobs and brought asset deploy time from 2.5 minutes to 30s.
+ +#### Database Migration +In our case, running a database migration task in Fargate involved starting a new task instance of our `database_migration` task family. Due to Fargate startup slowness, this task would take 3 minutes to run a simple `bundle exec rails db:migrate`. + +To resolve this, we used `git` and [Gitlab environments](https://docs.gitlab.com/ee/api/environments.html#get-a-specific-environment) to look for modified files in the `db/migrate` directory. If none were found, we would skip running the migration task. Since the majority of our deployments don't run database migration tasks, this shaved 3 minutes off most jobs. +``` +env_json=$(curl --silent --header "PRIVATE-TOKEN: " "/environments/") +last_deployment_sha=$(echo $env_json | jq -r '.last_deployment.sha') +git diff --name-only $CI_COMMIT_SHA $last_deployment_sha | grep db/migrate +``` + +#### Other things to look for +If you run sidecar containers like Datadog, make sure that you're providing enough memory and CPU to those containers to avoid waiting on them to be ready while your main container has already started. + + +We hope this helps you speed up your deployments and gain greater efficiency!
diff --git a/_posts/2021-04-12-embedding-based-retrieval-scribd.md b/_posts/2021-04-12-embedding-based-retrieval-scribd.md new file mode 100644 index 0000000..e31f210 --- /dev/null +++ b/_posts/2021-04-12-embedding-based-retrieval-scribd.md @@ -0,0 +1,288 @@ +--- +layout: post +title: "Embedding-based Retrieval at Scribd" +author: div +tags: +- machinelearning +- real-time +- search +- featured +team: Recommendations +--- + +Building recommendation systems like those implemented at large companies such as +[Facebook](https://arxiv.org/pdf/2006.11632.pdf) and +[Pinterest](https://labs.pinterest.com/user/themes/pin_labs/assets/paper/pintext-kdd2019.pdf) +can be accomplished using off-the-shelf tools like Elasticsearch.
Many modern recommendation systems implement +*embedding-based retrieval*, a technique that uses embeddings to represent documents, and then converts the +recommendations retrieval problem into a [similarity search](https://en.wikipedia.org/wiki/Similarity_search) problem +in the embedding space. This post details our approach to “embedding-based retrieval” with Elasticsearch. + +### Context +Recommendations plays an integral part in helping users discover content that delights them on the Scribd platform, +which hosts millions of premium ebooks, audiobooks, etc along with over a hundred million user uploaded items. + +![](/post-images/2021-04-ebr-scribd/f1.png) + +*Figure One: An example of a row on Scribd’s home page that is generated by the recommendations service* + +Currently, Scribd uses a collaborative filtering based approach to recommend content, but this model limits our ability +to personalize recommendations for each user. This is our primary motivation for rethinking the way we recommend content, +and has resulted in us shifting to [Transformer](http://jalammar.github.io/illustrated-transformer/) -based sequential +recommendations. While model architecture and details won’t be discussed in this post, the key takeaway is that our +implementation outputs *embeddings* – vector representations of items and users that capture semantic information such +as the genre of an audiobook or the reading preferences of a user. Thus, the challenge is now how to utilize these +millions of embeddings to serve recommendations in an online, reliable, and low-latency manner to users as they +use Scribd. We built an embedding-based retrieval system to solve this use case. + +### Recommendations as a Faceted Search Problem +There are many technologies capable of performing fast, reliable nearest neighbors search across a large number of +document vectors. However, our system has the additional challenge of requiring support for +[faceted search](https://en.wikipedia.org/wiki/Faceted_search) – that is, being able to retrieve the most relevant +documents over a subset of the corpus defined by user-specific business rules (e.g. language of the item or geographic +availability) at query time. At a high level, we desired a system capable of fulfilling the following requirements: + +1. The system should be able to prefilter results over one or more given facets. This facet can be defined as a filter +over numerical, string, or category fields +2. The system should support one or more exact distance metrics (e.g. dot product, euclidean distance) +3. The system should allow updates to data without downtime +4. The system should be highly available, and be able to respond to a query quickly. We targeted a service-level +objective (SLO) with p95 of <100ms +5. The system should have helpful monitoring and alerting capabilities, or provide support for external solutions + +After evaluating several candidates for this system, we found Elasticsearch to be the most suitable for our use case. +In addition to satisfying all the requirements above, it has the following benefits: + +- Widely used, has a large community, and thorough documentation which allows easier long-term maintenance and onboarding +- Updating schemas can easily be automated using pre-specified templates, which makes ingesting new data and maintaining +indices a breeze +- Supports custom plugin integrations + +However, Elasticsearch also has some drawbacks, the most notable of which is the lack of true in-memory partial updates. 
+This is a dealbreaker if updates to the system happen frequently and in real-time, but our use case only requires support +for nightly batch updates, so this is a tradeoff we are willing to accept. + +We also looked into a few other systems as potential solutions. While +[Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch/) (aka AWS Managed Elasticsearch) was +originally considered due to its simplicity in deployment and maintenance, we decided not to move forward with this +solution due to its lack of support for prefiltering. [Vespa](https://vespa.ai/) is also a promising candidate that has a +bunch of additional useful features, such as true in-memory partial updates, and support for integration with TensorFlow +for advanced, ML-based ranking. The reason we did not proceed with Vespa was due to maintenance concerns: deploying to +multiple nodes is challenging since EKS support is lacking and documentation is sparse. Additionally, Vespa requires the +entire application package containing all indices and their schemas to be deployed at once, which makes working in a +distributed fashion (i.e. working with teammates and using a VCS) challenging. + +### How to Set Up Elasticsearch as a Faceted Search Solution + +![](/post-images/2021-04-ebr-scribd/f2.png) + +*Figure Two: A high level diagram illustrating how the Elasticsearch system fetches recommendations* + +Elasticsearch stores data as JSON documents within indices, which are logical namespaces with data mappings and shard +configurations. For our use case, we defined two indices, a `user_index` and an `item_index`. The former is essentially +a key-value store that maps a user ID to a corresponding user embedding. A sample document in the `user_index` looks like: + +``` +{"_id": 4243913, + "user_embed": [-0.5888184, ..., -0.3882332]} +``` + +Notice here we use Elasticsearch’s inbuilt `_id` field rather than creating a custom field. This is so we can fetch user +embeddings with a `GET` request rather than having to search for them, like this: + +``` +curl :9200/user_index/_doc/4243913 +``` + +Now that we have the user embedding, we can use it to query the `item_index`, which stores each item’s metadata +(which we will use to perform faceted search) and embedding. Here’s what a document in this index could look like: + +``` +{"_id": 13375, + "item_format": "audiobook", + "language": "english", + "country": "Australia", + "categories": ["comedy", "fiction", "adventure"], + "item_embed": [0.51400936,...,0.0892048]} +``` + +We want to accomplish two goals in our query: retrieving the most relevant documents to the user (which in our model +corresponds to the dot product between the user and item embeddings), and ensuring that all retrieved documents have the +same filter values as those requested by the user. This is where Elasticsearch shines: + +``` +curl -H 'Content-Type: application/json' \ +:9200/item_index/_search \ +-d \ +' +{"_source": ["_id"], + "size": 30, + "query": {"script_score": {"query": {"bool": + {"must_not": {"term": {"categories": "adventure"}}, + "filter": [{"term": {"language": "english"}}, + {"term": {"country": "Australia"}}]}}, + "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed'); + return Math.max(0, value+10000);", + "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}} +' +``` + +Let’s break this query down to understand what’s going on: + +1. Line 2: Here we are querying the `item_index` using Elasticsearch’s `_search` API +2. 
Lines 5,6: We specify which attributes of the item documents we’d like returned (in this case, only `_id`), and how many +results (`30`) +3. Line 7: Here we are querying using the `script_score` feature; this is what allows us to first prefilter our corpus and +then rank the remaining subset using a custom script +4. Lines 8-10: Elasticsearch has various different boolean query types for filtering. In this example we specify that we +are interested only in `english` items that can be viewed in `Australia` and which are not categorized as `adventure` +5. Lines 11-12: Here is where we get to define our custom script. Elasticsearch has a built-in `dot_product` method we +can employ, which is optimized to speed up computation. Note that our embeddings are not normalized, and Elasticsearch +prohibits negative scores. For this reason, we had to include the score transformation in line 12 to ensure our scores +were positive +6. Line 13: Here we can add parameters which are passed to the scoring script + +This query will retrieve recommendations based on one set of filters. However, in addition to user filters, each row on +Scribd’s homepage also has row-specific filters (for example, the row “Audiobooks Recommended for You” would have a +row-specific filter of `"item_format": "audiobook"`). Rather than making multiple queries to the Elasticsearch cluster +with each combination of user and row filters, we can conduct multiple independent searches in a single query using the +`_msearch` API. The following example query generates recommendations for hypothetical “Audiobooks Recommended for You” +and “Comedy Titles Recommended for You” rows: + +``` +curl -H 'Content-Type: application/json' \ +:9200/_msearch \ +-d \ +' +{"index": "item_index"} +{"_source": ["_id"], + "size": 30, + "query": {"script_score": {"query": {"bool": + {"must_not": {"term": {"categories": "adventure"}}, + "filter": [{"term": {"language": "english"}}, + {"term": {"item_format": "audiobook"}}, + {"term": {"country": "Australia"}}]}}, + "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed'); + return Math.max(0, value+10000);", + "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}} +{"index": "item_index"} +{"_source": ["_id"], + "size": 30, + "query": {"script_score": {"query": {"bool": + {"must_not": {"term": {"categories": "adventure"}}, + "filter": [{"term": {"language": "english"}}, + {"term": {"categories": "comedy"}}, + {"term": {"country": "Australia"}}]}}, + "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed'); + return Math.max(0, value+10000);", + "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}} +' +``` + +#### Shard Configuration +Elasticsearch stores multiple copies of data across multiple nodes for resilience and increased query performance in a +process known as sharding. The number of primary and replica shards is configurable only at index creation time. Here are +some things to consider regarding shards: + +1. Try out various shard configurations to see what works best for each use case. +[Elastic](https://www.elastic.co/blog/how-many-shards-should-i-have-in-my-elasticsearch-cluster) recommends 20-40GB of data per +shard, while [eBay](https://tech.ebayinc.com/engineering/elasticsearch-performance-tuning-practice-at-ebay/) likes to keep +their shard size below 30GB. However, these values did not work for us, and we found much smaller shard sizes (<5GB) to +boost performance in the form of reduced latency at query time. +2. 
When updating data, do not update documents within the existing index. Instead, create a new index, ingest updated +documents into this index, and [re-alias](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html) +from the old index to the new one. This process will allow you to retain older data in case an update needs to be reverted, +and allows re-configurability of shards at update time. + +#### Cluster Configuration +We deployed our cluster across multiple data and primary nodes to enjoy the benefits of data redundancy, increased +availability, and the ability to scale horizontally as our service grows. We found that deploying the cluster across +multiple availability zones results in an increased latency during query time, but this is a tradeoff we accepted in the +interest of availability. + +As for hardware specifics, we use AWS EC2 instances to host our cluster. In production, we have 3 `t3a.small` primary-only +nodes, 3 `c5d.12xlarge` data nodes, and 1 `t3.micro` Kibana node. The primary-only nodes are utilized only in a coordinating +role (to route requests, distribute bulk indexing, etc), essentially acting as smart load balancers. This is why these +nodes are much smaller than the data nodes, which handle the bulk of storage and computational costs. Kibana is a data +visualization and monitoring tool; however, in production we use Datadog for our monitoring and alerting responsibilities, +which is why we do not allocate many resources for the Kibana node. + +### What Generating Recommendations Looks Like + +![](/post-images/2021-04-ebr-scribd/f3.png) + +*Figure Three: a diagram illustrating the system design for Personalization’s Embedding-based Retrieval Service* + +Step by step, this is how recommendations are generated for a user when he/she requests the home page: +1. The Scribd app passes the user’s information to the recommendations service +2. The recommendations service queries Elasticsearch with the user’s ID to retrieve their user embedding, which is stored +in a user index +3. The recommendations service once again queries Elasticsearch, this time with the user’s embedding along with their +user query filters. This query is a multi-search request to the item index: one for every desired row +4. Elasticsearch returns these recommendations to the service, which are postprocessed and generated into rows before +being sent to the client +5. The client renders these recommendations and displays them to the user + +With this approach, Elasticsearch will serve two purposes: acting as a key-value store, and retrieving recommendations. +Elasticsearch is, of course, a slower key-value store than traditional databases, but we found the increase in latency +to be insignificant (~5ms) for our use case. Furthermore, the benefit of this approach is that it only requires +maintaining data in one system; using multiple systems to store data would create a consistency challenge. + +The underlying Personalization model is very large, making retraining it a very expensive process. Thus, it needs to be +retrained often enough to account for factors like user preference drift but not too often so as to be efficient with +computational resources. We found retraining the model weekly worked well for us. item embeddings, which typically update +only incrementally, are also recomputed weekly. However, user embeddings are recomputed daily to provide fresh +recommendations based on changing user interests. 
These embeddings along with relevant metadata are ingested into the +Elasticsearch index in a batch process using [Apache Spark](https://spark.apache.org/) and are scheduled through +[Apache Airflow](https://airflow.apache.org/). We monitor this ingest process along with real-time serving metrics +through Datadog. + +#### Load Testing +Our primary goal during load testing was to ensure that our system was able to reliably respond to a “reasonably large” +number of requests per second and deliver a sufficient number of relevant recommendations, even under the confines of +multiple facets within each query. We also took this opportunity to experiment with various aspects of our system to +understand their impact on performance. These include: +- Shard and replica configuration: We found that increasing the number of shards increased performance, but only to a +point; If a cluster is over-sharded, the overhead of each shard outweighs the marginal performance gain of the +additional partition +- Dataset size: We artificially increased the size of our corpus several times to ensure the system’s performance would remain +sufficient even as our catalog continues to grow +- Filter and mapping configurations: Some filters (like `range` inequalities) are more expensive than traditional +categorical filters. Additionally, increasing the number of fields in each document also has a negative impact on latency. +Our use case calls for several filters across hundreds of document fields, so we played with several document and query +configuration to find the one most optimal for the performance of our system + +Our system is currently deployed to production and serves ~50rps with a p95 latency <60ms. + +### Results +Using Scribd’s internal A/B testing platform, we conducted an experiment comparing the existing recommendations service +with the new personalization model with embedding-based retrieval architecture across the home and discover page surfaces. +The test ran for approximately a month with >1M Scribd users (trialers or subscribers) assigned as participants. After +careful analysis of results, we saw the following statistically significant (p<0.01) improvements in the personalization +variant compared to the control experience: +- Increase in the number of users who clicked on a recommended item +- Increase in the average number of clicks per user +- Increase in the number of users with a read time of at least 10 minutes (in a three day window) + +These increases represent significant business impact on key performance metrics. The personalization model currently +generates recommendations for every (signed in) Scribd user’s home and discover pages. + +### Next Steps +Now that the infrastructure and model are in place, we are looking to add a slew of improvements to the existing system. +Our immediate efforts will focus on expanding the scope of this system to include more surfaces and row modules within +the Scribd experience. Additional long term projects include the addition of an online contextual reranker to increase +the relevance and freshness of recommendations and potentially integrating our system with an infrastructure as code +tool to more easily manage and scale compute resources. + +Thank you for reading! We hope you found this post useful and informative. 
### Thank You 🌮
Thank you to
[Snehal Mistry](https://www.linkedin.com/in/snehal-mistry-b986b53/),
[Jeffrey Nguyen](https://www.linkedin.com/in/jnguyenfc/),
[Natalie Medvedchuk](https://www.linkedin.com/in/natalie-medvedchuk/),
[Dimitri Theoharatos](https://www.linkedin.com/in/dimitri-theoharatos/),
[Adrian Lienhard](https://www.linkedin.com/in/adrianlienhard/),
and countless others, all of whom provided invaluable guidance and assistance throughout this project.

(giving tacos 🌮 is how we show appreciation here at Scribd)

diff --git a/_posts/2021-04-26-integrating-airflow-and-okta.md b/_posts/2021-04-26-integrating-airflow-and-okta.md
new file mode 100644
index 0000000..b9d19cb
--- /dev/null
+++ b/_posts/2021-04-26-integrating-airflow-and-okta.md
@@ -0,0 +1,122 @@
---
layout: post
title: "Integrating Airflow with Okta"
author: kuntalb
tags:
- okta
- airflow
- featured
team: Core Platform
---

At Scribd we use Airflow as the scheduler for most of our batch workloads. This post is not about Airflow itself, so we won't get into why we chose it; it is about one of the biggest challenges we faced while using Airflow and finally conquered: authentication and authorisation for Airflow. Of course, Airflow does support LDAP, and at Scribd we started out using LDAP with Airflow, but as the organisation grew and more and more users started using Airflow, it became imperative that we integrate Airflow with our SSO provider, Okta.

Sadly, there is a lack of resources on how to integrate Airflow with Okta specifically. This write-up describes the journey of moving Airflow from the earlier LDAP setup to Okta.


## Prerequisite
This section describes the minimum setup required to enable this integration.
1. Okta with [API Access Management](https://developer.okta.com/docs/concepts/api-access-management/) enabled.
Without this feature enabled in Okta, we will not be able to integrate Airflow with Okta.

We are going to use Flask AppBuilder along with some additional packages to integrate Airflow with Okta. At Scribd we use a custom-built Docker image for Airflow, and we install the following libraries in that Docker image to make the Airflow integration work with Okta:
1. [Flask-AppBuilder 3.2.2](https://github.com/dpgaspar/Flask-AppBuilder/tree/v3.2.2). The official
   Airflow repo has a
   [constraint](https://github.com/apache/airflow/blob/master/setup.cfg#L97) on
   `flask-appbuilder~=3.1,>=3.1.1`, so adding this explicitly to the Docker image helps us bypass that constraint
1. `sqlalchemy>=1.3.18, <1.4.0` --> required by a Python dependency of Flask-AppBuilder
1. `authlib==0.15.3` --> authlib needs to be installed alongside Airflow to enable the Flask-AppBuilder integration with Okta via OIDC

## Okta Setup

![Sample Okta Setup](/post-images/2021-04-okta-airflow/sample-okta-setup.png)
*Sample Okta Setup*
+ +1. Create an OIDC Web application. Give it a name and leave the values under the “Configure OpenID Connect” section empty. +1. Make note of the Client ID and the Client Secret, as you will need them for configuring the airflow webserver. +1. In the “Allowed Grant Types” section, make sure you check all of the boxes. +1. For the Login redirect URIs field, you will enter: `https://your-airflow-url-goes-here.com/oauth-authorized/okta` +1. For the Initiate login URI field, you will enter: `https://your-airflow-url-goes-here.com/login` + +## Airflow Configuration + +`conf/webserver_config.py` + + AUTH_TYPE = AUTH_OAUTH + OAUTH_PROVIDERS = [ + {'name': 'okta', 'icon': 'fa-circle-o', + 'token_key': 'access_token', + 'remote_app': { + 'client_id': <<>>, + 'client_secret': <<>>, + 'api_base_url': 'https://<>/oauth2/v1/', + 'client_kwargs': { + 'scope': 'openid profile email groups' + }, + 'access_token_url': 'https://<>/oauth2/v1/token', + 'authorize_url': 'https://<>/oauth2/v1/authorize', + } + } + ] + +A special thanks to Greg Reznik for handling everything related to Okta configuration + +### Special Steps + +1. We started with Flask-AppBuilder 3.2.1, however it had a bug that needs to + be fixed, we raised a [PR for Flask-AppBuilder](https://github.com/dpgaspar/Flask-AppBuilder/pull/1589) to resolve that issue. That PR got merged and now we can use the new release, Flask-AppBuilder 3.2.2 + +2. As we were migrating from LDAP, we will already have user info populated, + however Okta generates a new user id something like + this `okta_00u1046sqzJprt1hZ4x6`, but as the email id corresponding to that + user id is already present we got the below error. To prevent this we logged + into the underlying database for Airflow and cleaned up the `ab_user` and + `ab_user_role` table and let Okta integration recreate the user during first + sign up. + + ``` + [2021-03-19 16:32:28,559] {manager.py:215} ERROR - Error adding new user to database. (sqlite3.IntegrityError) UNIQUE constraint failed: ab_user.email + [SQL: INSERT INTO ab_user (first_name, last_name, username, password, active, email, last_login, login_count, fail_login_count, created_on, changed_on, created_by_fk, changed_by_fk) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)] + [2021-03-19 16:32:28,560] {manager.py:1321} ERROR - Error creating a new OAuth user okta_00u1046sqzJprt1hZ4x6 + ``` +3. Because we have deleted all the existing user and role, once the users logged in for the first time, + especially for the first admin user we did the following from the airflow cli. + This will create the first admin user after that if needed we can propagate other user and roles from the Airflow web console from this admin user account. + ``` + airflow users add-role -r Admin -u okta_00u1046sqzJprt1hZ4x6 + ``` + +## Known Issue + +1. Currently in the audit log, any action triggered on Airflow has Okta user id. Airflow needs to be patched to write out audit log entries with human readable user identifiers instead. + +## Final Stage + +Once the setup is complete, you will find the similar tiles on your okta dashboard, + +![Sample Okta Tiles](/post-images/2021-04-okta-airflow/okta-tiles.png) +
*Sample Okta Tiles*
+ +Once you select the tiles, it should redirect you to the below page + +![Sample Okta Login Page](/post-images/2021-04-okta-airflow/airflow-login.png) +
*Okta Login Page*
Hope this doc helps you integrate Okta with Airflow. The journey was a bit of a tricky one for us, but we finally made it happen, and we hope this write-up helps a lot of folks integrate Airflow with Okta successfully.

---

Within Scribd's Platform Engineering group we have a *lot* more services than
people, so we're always trying to find new ways to automate our infrastructure.
If you're interested in helping to build out a scalable data platform to help
change the way the world reads, [come join us!](/careers/#open-positions)

diff --git a/_posts/2021-05-04-backing-up-data-warehouse.md b/_posts/2021-05-04-backing-up-data-warehouse.md
new file mode 100644
index 0000000..a4af54b
--- /dev/null
+++ b/_posts/2021-05-04-backing-up-data-warehouse.md
@@ -0,0 +1,146 @@
---
layout: post
title: "Backing up Delta Lake"
author: kuntalb
tags:
- deltalake
- s3
- data-warehouse
- backup
- featured
team: Core Platform
---

Transitioning from a more traditional database operations background (read: ACID, RDBMS, and so on) to a newer data platform is always interesting, as it constantly challenges all your years-old wisdom and forces you to adapt to new ways of getting things done.

At [Scribd](https://tech.scribd.com/) we have made
[Delta Lake](https://delta.io/) a cornerstone of our data platform. All data in
Delta Lake is stored in [Apache Parquet](https://parquet.apache.org/) format, enabling Delta Lake to leverage
the efficient compression and encoding schemes that are native to Parquet. The
Delta Lake transaction log (also known as the `DeltaLog`) is an ordered record of
every transaction that has ever been performed on a Delta Lake table since its
inception. So for a particular dataset to work properly, it needs to have both the
Parquet files and the corresponding `DeltaLog`.

When the task of having a workable backup of all those Delta Lake files fell
into my lap, I decided to look at some of the age-old concepts of backup from a new
perspective. The concerns I considered were:

 1. What am I protecting against? How much do I need to protect?
 1. Can I survive losing some data during a restore, and do I have the option of rebuilding it again from that point-in-time recovery?
 1. What kind of protection do I want to put in place for the backed-up data?

So we set our objectives as:

 1. We are mainly protecting against human error, where a table can be purged by mistake ([VACUUM](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-vacuum.html)), which would severely hamper our ability to do a time travel if required.
 1. In most cases, if we have a reasonable backup ready, we should be able to rebuild the Delta table data lost between the time the backup was taken and the time a drop table occurred.


## The Devil is in the Details

After deliberating a lot, we decided to do this whole backup operation
independent of [Delta Lake](https://delta.io/) and go to the lowest layer
possible, which in our case was S3. I never thought I would say this in my
life (being an RDBMS DBA), but the moment we get down to the S3 layer, the whole thing
becomes a challenge of copying a few S3 buckets (read: millions of files)
instead of a database backup.
So we started looking for an efficient S3 copy operation and found [AWS S3
batch
operation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html)
and its feature for copying objects across AWS accounts. This was a match
made in heaven for us.
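To make that concrete, here is a minimal sketch of creating such a cross-account copy job with boto3. The account ID, bucket names, role ARN, and manifest location below are all illustrative; in our setup, job creation like this is automated by a Lambda described later in this post.

```python
import uuid

import boto3

# Hypothetical destination-account values; substitute your own.
DEST_ACCOUNT_ID = "111111111111"
BACKUP_BUCKET_ARN = "arn:aws:s3:::example-backup-bucket-2021-03-31"
BATCH_ROLE_ARN = f"arn:aws:iam::{DEST_ACCOUNT_ID}:role/example-s3-batch-copy-role"

# Jobs must be created in the destination region.
s3control = boto3.client("s3control", region_name="us-east-1")

# Create an S3 Batch Operations job that copies every object listed in an
# S3 Inventory manifest into the backup bucket in the destination account.
response = s3control.create_job(
    AccountId=DEST_ACCOUNT_ID,
    ConfirmationRequired=False,
    Priority=10,
    RoleArn=BATCH_ROLE_ARN,
    ClientRequestToken=str(uuid.uuid4()),
    Operation={"S3PutObjectCopy": {"TargetResource": BACKUP_BUCKET_ARN}},
    Manifest={
        "Spec": {"Format": "S3InventoryReport_CSV_20161130"},
        "Location": {
            "ObjectArn": "arn:aws:s3:::example-inventory-bucket/source-bucket/daily/2021-04-02T00-00Z/manifest.json",
            "ETag": "example-etag-of-manifest-json",
        },
    },
    # Completion report with one line per copied (or failed) object.
    Report={
        "Bucket": "arn:aws:s3:::example-batch-report-bucket",
        "Format": "Report_CSV_20180820",
        "Enabled": True,
        "Prefix": "backup-reports",
        "ReportScope": "AllTasks",
    },
)
print("Created S3 Batch Operations job:", response["JobId"])
```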
You can use [AWS S3 batch operation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html) to perform large-scale batch operations on Amazon S3 objects. S3 Batch Operations can perform a single operation on lists of Amazon S3 objects that you specify. A single job can perform a specified operation (in our case, copy) on billions of objects containing large sets of data. This operation has the following features:

 1. Automatically tracks progress.
 1. Stores a detailed completion report of all or selected actions in a user-defined bucket.
 1. Provides a fully managed, auditable, and serverless experience.

Once we decided to use [AWS S3 batch operation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html), the next biggest challenge was how to generate the inventory list that would feed the batch operation. We decided to use [AWS S3 inventory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-inventory.html) to generate the inventory list. There are some trade-offs associated with that as well.

**Pros**:

* Simple setup; we can terraform it easily
* Much more efficient than generating the list ourselves, as the list-objects API only returns 1000 rows per call, which means we would have to keep iterating until we get the full list.

**Cons**:

* We do not control when it runs; it will generate a report on a daily basis, but the timing is not in our hands.
* It runs in an eventually consistent model, i.e. all of your objects might not appear in each inventory list. The inventory list provides eventual consistency for PUTs (for both new objects and overwrites) and DELETEs. Inventory lists are a rolling snapshot of bucket items, which are eventually consistent (that is, the list might not include recently added or deleted objects).

To overcome the downsides, we decided to run the backup at a later date, e.g. for a backup of March 31st we based that off a manifest generated on April 2nd. This manifest would certainly have all data up until March 31st and some of April 1st's files as well.

Once we had settled on this model, the rest of the work was similar to any
other backup process. We also set up the source and the destination to have
protective boundaries so that we don't accidentally propagate any deletes to
the backups.

### New Account New Beginning

To stop accidental deletion of the backed-up data, we decided to put the
backed-up data set in a completely separate bucket in a different AWS account
with stringent access controls in place. With a new account it was much easier to
control the access level from the beginning, rather than restricting access in
an already existing account where people already have a certain degree of access
that is hard to modify. In the new account we ensured only a handful of people would actually have
access to the backed-up data, further reducing the chances of any manual error.

### Backup Process

#### Destination Side

 1. The backup will be taken in a completely separate AWS account from the source
    account. Hardly any admins will have access to this account, to reduce the
    chance of manual mistakes.
 1. The whole backup process will be automated with minimal human intervention to reduce the scope for manual error.
 1. On the destination side we have to create buckets to store the inventory reports on which the batch job will be based.
 1. On the destination side we also have to create buckets to store the actual backup,
    where the batch job will store the backup objects. While terraforming this, we
    have the bucket name dynamically created with the date appended at the
    end, e.g. `<bucket-name>-<date>`, so that
    before each full snapshot we can create these buckets. Otherwise there is a
    risk of earlier full snapshots getting overwritten.
 1. Create an IAM role for the batch operation; the source account will grant the copy-object permission to this role.
 1. We created a Lambda on the destination side to scan through all the `manifest.json` files, create the actual batch operation, and run it automatically.

#### Source Side

 1. We terraformed an inventory management config for all of the source-side buckets listed above.
 1. This inventory config will create the inventory in the designated manifest bucket in the destination account.
 1. For all the buckets on the source side, we have to add a bucket-level policy that allows the S3 batch operation role created on the destination side to perform the copy operation.


### Limitations

These are mainly the limitations of AWS S3 batch operation:
 1. All source objects must be in one bucket.
    - This is not a challenge for us, as we invoke a bucket-level copy and create a manifest at the bucket level, which meets this requirement.
 1. All destination objects must be in one bucket.
    - This is not a challenge for us, as we invoke a bucket-level copy and create a manifest at the bucket level, which meets this requirement.
 1. You must have read permissions for the source bucket and write permissions for the destination bucket.
    - Again, proper IAM roles for the S3 batch copy operation can manage this.
 1. Objects to be copied can be up to 5 GB in size.
    - S3 Batch uses the PUT method, so it is limited to 5 GB. If any file was uploaded manually and is larger than 5 GB, we will skip it. We tested this behaviour and found that the batch operation throws the following error and continues with the rest of the operation.
    ```Some-file-name,,failed,400,InvalidRequest,The specified copy source is larger than the maximum allowable size for a copy source: 5368709120 (Service: Amazon S3; Status Code: 400; Error Code: InvalidRequest; Request ID: FHNW4MF5ZMKBPDQY; S3 Extended Request ID: /uopiITqnCRtR1/W3K6DpeWTiJM36T/14azeNw4q2gBM0yj+r0GwzhmmHAsEMkhNq9v8NK4rcT8=; Proxy: null)```

 1. Copy jobs must be created in the destination region, which is the region you intend to copy the objects to.
    - Again, for our purpose this is what we intended to do anyway.
 1. If the buckets are un-versioned, you will overwrite objects with the same key names.
    - We will create new buckets for each full snapshot to mitigate this.

## Conclusion

The above approach worked well for our purpose, and if we follow the process
properly it should suffice for many of our use-cases. This approach can work quite well if, like us, you do not have
the luxury of doing a "Stop the World" on your data warehouse writes, and still
need to have a backup with a certain degree of confidence. This method does not
provide an accurate point-in-time snapshot due to the "eventually consistent"
model of manifest generation, but I believe it covers most of the use-cases for
any Delta Lake backups.

---

Within Scribd's Platform Engineering group we have a *lot* more services than
people, so we're always trying to find new ways to automate our infrastructure.
+If you're interested in helping to build out scalable data platform to help +change the world reads, [come join us!](/careers/#open-positions) diff --git a/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md b/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md new file mode 100644 index 0000000..4479cbc --- /dev/null +++ b/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md @@ -0,0 +1,99 @@ +--- +layout: post +title: "Growing the Delta Lake ecosystem with Rust and Python" +tags: +- featured +- rust +- deltalake +- python +author: rtyler +team: Core Platform +--- + + +Scribd stores billions of records in [Delta Lake](https://delta.io) but writing +or reading that data had been constrained to a single tech stack, all of that +changed with the creation of [delta-rs](https://github.com/delta-io/delta-rs). +Historically using Delta Lake required applications to be implemented with or +accompanied by [Apache Spark](https://spark.apache.org). Many of our batch +and streaming data processing applications are all Spark-based, but that's not +everything that exists! In mid-2020 it became clear that Delta Lake would be a +powerful tool in areas adjacent to the domain that Spark occupies. From my +perspective, I figured that would soon need to bring data into and out of Delta +Lake in dozens of different ways. Some discussions and prototyping led to the +creation of "delta-rs", a Delta Lake client written in Rust that can be easily +embedded in other languages such as +[Python](https://delta-io.github.io/delta-rs/python), Ruby, NodeJS, and more. + + +The [Delta Lake +protocol](https://github.com/delta-io/delta/blob/master/PROTOCOL.md) is not +_that_ complicated as it turns out. At an extremely high level, Delta Lake is a +JSON-based transaction log coupled with [Apache +Parquet](https://parquet.apache.org) files stored on disk/object storage. This means the core implementation of Delta in [Rust](https://rust-lang.org) is similarly quite simple. Take the following example from our integration tests which "opens" a table, reads it's transaction log and provides a list of Parquet files contained within: + + +```rust +let table = deltalake::open_table("./tests/data/delta-0.2.0") + .await + .unwrap(); +assert_eq!( + table.get_files(), + vec![ + "part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet", + "part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet", + "part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet", + ] +); +``` + +Our primary motivation for delta-rs was to create something which would +accommodate high-throughput writes to Delta Lake and allow embedding for +languages like Python and Ruby such that users of those platforms could perform +light queries and read operations. + +The first notable writer-based application being co-developed with delta-rs is +[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest). The +project aims to provide a highly efficient daemon for ingesting +Kafka-originating data into Delta tables. In Scribd's stack, it will +effectively bridge JSON flowing into [Apache Kafka](https://kafka.apache.org) +topics into pre-defined Delta tables, translating a single JSON message into a +single row in the table. 
+ +From the reader standpoint, the Python interface built on top of delta-rs, +contributed largely by [Florian Valeye](https://github.com/fvaleye) makes +working with Delta Lake even simpler, and for most architectures you only need +to run `pip install deltalake`: + +```python +from deltalake import DeltaTable +from pprint import pprint + +if __name__ == '__main__': + # Load the Delta Table + dt = DeltaTable('s3://delta/golden/data-reader-primitives') + + print(f'Table version: {dt.version()}') + + # List out all the files contained in the table + for f in dt.files(): + print(f' - {f}') + + # Create a Pandas dataframe to execute queries against the table + df = dt.to_pyarrow_table().to_pandas() + pprint(df.query('as_int % 2 == 1')) +``` + +I cannot stress enough how much potential the above Python snippet has for +machine learning and other Python-based applications at Scribd. For a number +of internal applications developers have been launching Spark clusters for the +sole purpose of reading some data from Delta Lake in order to start their model +training workloads in Python. With the maturation of the Python `deltalake` +package, now there is a fast and easy way to load Delta Lake into basic Python +applications. + + + +From my perspective, it's only the beginning with [delta-rs](https://github.com/delta-io/delta-rs). Delta Lake is a deceptively simple technology with tremendous potential across the data platform. I will be sharing more about delta-rs at [Data and AI Summit](https://databricks.com/dataaisummit/north-america-2021) on May 27th at 12:10 PDT. I hope you'll join [my session](https://databricks.com/speaker/r-tyler-croy) with your questions about delta-rs and where we're taking it! + + diff --git a/_posts/2021-05-19-kafka-delta-ingest.md b/_posts/2021-05-19-kafka-delta-ingest.md new file mode 100644 index 0000000..03bad83 --- /dev/null +++ b/_posts/2021-05-19-kafka-delta-ingest.md @@ -0,0 +1,195 @@ +--- +layout: post +title: "Kafka to Delta Lake, as fast as possible" +tags: +- featured +- rust +- deltalake +- kafka +author: christianw +team: Core Platform +--- + +Streaming data from Apache Kafka into Delta Lake is an integral part of +Scribd's data platform, but has been challenging to manage and +scale. We use Spark Structured Streaming jobs to read data from +Kafka topics and write that data into [Delta Lake](https://delta.io) tables. This approach gets the job +done but in production our experience has convinced us that a different +approach is necessary to efficiently bring data from Kafka to Delta Lake. To +serve this need, we created +[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest). + +The user requirements are likely relatable to a lot of folks: + +* _My application emits data into Kafka that I want to analyze later._ +* _I want my Kafka data to land in the data warehouse and be queryable pretty soon after ingestion._ + +Looking around the internet, there are few approaches people will blog about +but many would either cost too much, be really complicated to setup/maintain, +or both. Our first Spark-based attempt at solving this problem falls under +"both." + +Spark Structured Streaming is a powerful streaming framework that can easily +satisfy the requirements described above with a few lines of code (about 70 in +our case) but the cost profile is pretty high. Despite the relative simplicity +of the code, the cluster resources necessary are significant. Many of our +variable throughput Kafka topics leave us wishing for auto-scaling too. 
[Kafka Delta Ingest](https://github.com/delta-io/kafka-delta-ingest) is an open
source daemon created by Scribd in the [Delta Lake project](https://delta.io)
with the very specific goal of optimizing the path from Kafka to Delta Lake. By
focusing on this very specific use-case, we can remove many of the pain points
we currently experience with our Spark streaming jobs. The daemon is written in
[Rust](https://rust-lang.org), which has helped us keep the runtime super
efficient. It is also fully distributed with no coordination between workers,
meaning no driver node hanging out and a smaller overall infrastructure
footprint.

## In depth

There is a bit of an impedance mismatch between Kafka streams and data warehouse
file structure. [Parquet is a columnar
format](https://parquet.apache.org/documentation/latest/), and each Parquet
file (in fact each row group within a file) in a Delta Lake table should
include a lot of rows to enable queries to leverage all the neat optimization
features of Parquet and run as fast as possible. Messages consumed from a Kafka
topic come in one at a time though. To bridge this mismatch, Kafka Delta Ingest
spends most of its time buffering messages in memory. It checks a few process
arguments to make the largest possible Parquet files. Those arguments are:

* allowed_latency - the latency allowed between each Delta write
* max_messages_per_batch - the maximum number of messages/records to include in each Parquet row group within a file
* min_bytes_per_file - the minimum bytes per Parquet file written out by Kafka Delta Ingest

Internally, our Kafka usage guidelines include these constraints:

* Messages written to Kafka
  * Must be JSON
  * Must include an ISO 8601 timestamp representing when the message was ingested/created (field name is flexible, but this timestamp must be included somewhere in the message)

* Records written to Delta Lake
  * Must include Kafka metadata
    * We preserve the metadata fields below under a struct field called `meta.kafka`
    * topic
    * partition
    * offset
    * timestamp
  * Must include a date-based partition (e.g. yyyy-MM-dd) derived from the ISO 8601 ingestion timestamp of the message

Other potential users of Kafka Delta Ingest may have different guidelines on how they use Kafka. Because of how we use Kafka internally, the first iteration of Kafka Delta Ingest is very focused on:

* JSON formatted messages
* Buffer flush triggers that thread the needle between query performance and persistence latency
* Very basic message transformations to limit the message schema constraints we push up to our producer applications

### Example

Let's say we have an application that writes messages onto a Kafka topic called
`web_requests` every time it handles an HTTP request. The message schema
written by the producer application includes fields such as:

* `status`: 200, 404, 500, 302, etc.
* `method`: `GET`, `POST`, etc.
* `url`: Requested URL, e.g. `/documents/42`, etc.
* `meta.producer.timestamp`: an ISO-8601 timestamp representing the time the producer wrote the message.

Many of our tables are partitioned by a field called `date` which
has a `yyyy-MM-dd` format. We choose not to force our producer application to
provide this field explicitly. Instead, we will configure our Kafka Delta
Ingest stream to perform a transformation of the `meta.producer.timestamp`
field that the producer already intends to send.
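To make the producer side of this example concrete, here is a minimal sketch of emitting such a message. The `kafka-python` client and the field values are illustrative assumptions, not part of kafka-delta-ingest itself:

```python
import json
from datetime import datetime, timezone

from kafka import KafkaProducer  # illustrative client choice

# Serialize dicts as the JSON messages kafka-delta-ingest expects.
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)

# One message per handled HTTP request, including the ISO 8601 producer timestamp.
producer.send("web_requests", {
    "status": 200,
    "method": "GET",
    "url": "/documents/42",
    "meta": {"producer": {"timestamp": datetime.now(timezone.utc).isoformat()}},
})
producer.flush()
```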
To accomplish this with Kafka Delta Ingest, using the "web_requests" stream as an example, we would:

1. Create the "web_requests" topic
1. Create the schema for our Delta Lake table:
   ```
CREATE TABLE `kafka_delta_ingest`.`web_requests` (
  `meta` STRUCT<
    `kafka`: STRUCT<`offset`: BIGINT, `topic`: STRING, `partition`: INT>,
    `producer`: STRUCT<`timestamp`: STRING>
  >,
  `method` STRING,
  `status` INT,
  `url` STRING,
  `date` STRING
)
USING delta
PARTITIONED BY (`date`)
LOCATION 's3://path_to_web_requests_delta_table'
```

   The Delta Lake schema we create includes more fields than the producer
   actually sends. Fields not written by the producer include the `meta.kafka`
   struct and the `date` field.

3. Launch one or more kafka-delta-ingest workers to handle the topic-to-table pipeline:
   ```
kafka-delta-ingest ingest web_requests s3://path_to_web_requests_delta_table \
  -l 60 \
  -K "auto.offset.reset=earliest" \
  -t 'date: substr(meta.producer.timestamp, `0`, `10`)' \
     'meta.kafka.offset: kafka.offset' \
     'meta.kafka.partition: kafka.partition' \
     'meta.kafka.topic: kafka.topic'
```

The parameters passed to the daemon configure the allowed latency, some primitive data augmentation, the source topic, and the destination Delta table. For more detailed documentation, consult the [readme](https://github.com/delta-io/kafka-delta-ingest#readme).

Internally, Kafka Delta Ingest relies on Kafka consumer groups to coordinate
partition assignment across many workers handling the same topic. If we
want to scale out the number of workers handling "web_requests", we can just
launch more ECS tasks with the same configuration and respond to Kafka's
rebalance events.

The deployment ends up looking like:

![Kafka Delta Ingest Deployment](/post-images/2021-05-kafka-delta-ingest/kafka-delta-ingest-deployment.png)

We have one Kafka Delta Ingest ECS service per topic-to-table ETL workload. Each service runs 24x7. We expect high-volume topics to require more worker nodes and to scale out and in occasionally, and low-volume topics to require a single worker (more on that later).
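To recap the example end to end, a single consumed `web_requests` message lands in the Delta table as a record shaped roughly like the following; the values are illustrative and simply mirror the transforms configured above:

```python
# Roughly what one ingested web_requests message becomes as a table row.
# The `date` value comes from substr(meta.producer.timestamp, 0, 10).
web_request_record = {
    "meta": {
        "kafka": {"topic": "web_requests", "partition": 0, "offset": 12345},
        "producer": {"timestamp": "2021-05-19T16:32:28.559Z"},
    },
    "method": "GET",
    "status": 200,
    "url": "/documents/42",
    "date": "2021-05-19",
}
```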
A Spark Structured Streaming job must be launched on a platform like Databricks or EMR capable of running a Spark cluster. + +## Get Involved! + +Contributions to Kafka Delta Ingest are very welcome and encouraged. Our core team has been focused on supporting our internal use case so far, but we would love to see Kafka Delta Ingest grow into a more well rounded solution. We have not been using the [GitHub issue list](https://github.com/delta-io/kafka-delta-ingest/issues) for managing work just yet since we are mostly managing work internally until we have our primary workloads fully covered, but we will be paying much more attention to this channel in the very near future. + +One especially interesting area for contribution is related to data format. A +lot of folks are using Avro and Protobuf instead of JSON these days. We use +JSON on all of our ingestion streams at the moment, but I'd love to see Avro +and Protobuf support in Kafka Delta Ingest. + +Another big contribution would be support for running periodically +rather than continuously (24x7). I suspect a lot of folks have situations +where Kafka is used as a buffer between data warehouse writes that +occur periodically throughout the day. We have several low-volume topics that +are not a good fit for 24x7 streaming because they only produce one or two +messages per second. Having a 24x7 process buffer these topics in memory would +be very awkward. It would make a lot more sense to let these buffer in Kafka +and launch a periodic cron-style job to do the ETL a few times a day. This is +similar to the "Trigger Once" capability in [Spark Structured +Streaming](https://databricks.com/blog/2017/05/22/running-streaming-jobs-day-10x-cost-savings.html). + +Another vector for contribution is +[delta-rs](https://github.com/delta-io/delta-rs). Delta-rs is another Scribd +sponsored open source project and is a key dependency of kafka-delta-ingest. +Any write-oriented improvement accepted in delta-rs is Clikely to benefit +kafka-delta-ingest. + + +Kafka Delta Ingest has a bright future ahead and I hope you'll join us! diff --git a/_posts/2021-07-08-automate-databricks-with-terraform.md b/_posts/2021-07-08-automate-databricks-with-terraform.md new file mode 100644 index 0000000..f995b0c --- /dev/null +++ b/_posts/2021-07-08-automate-databricks-with-terraform.md @@ -0,0 +1,51 @@ +--- +layout: post +title: "Automating Databricks with Terraform" +team: Core Platform +author: rtyler +tags: +- databricks +- terraform +- featured +--- + +The long term success of our data platform relies on putting tools into the +hands of developers and data scientists to “choose their own adventure”. A big +part of that story has been [Databricks](https://databricks.com) which we +recently integrated with [Terraform](https://terraform.io) to make it easy to +scale a top-notch developer experience. At the 2021 Data and AI Summit, Core +Platform infrastructure engineer [Hamilton +Hord](https://github.com/HamiltonHord) and Databricks engineer [Serge +Smertin](https://github.com/nfx) presented on the Databricks terraform provider +and how it's been used by Scribd. + +In the session embedded below, they share the details on the [Databricks (Labs) +Terraform +integration](https://github.com/databrickslabs/terraform-provider-databricks) +and how it can automate literally every aspect required for a production-grade +platform: data security, permissions, continuous deployment and so on. 
They +also discuss the ways in which our Core Platform team enables internal +customers without acting as gatekeepers for data platform changes. Just about +anything they might need in Databricks is a pull request away! + +
*(Embedded video: the Data and AI Summit session on the Databricks Terraform provider.)*
+ + +In hindsight, it's mind-boggling how much manual configuration we had to +previously maintain. With the Terraform provider for Databricks we can very +easily test, reproduce, and audit hundreds of different business critical +Databricks resources. Coupling Terraform with the recent "multi-workspace" +support that Databricks unveiled in 2020 means we can also now provision an +entirely new environment in a few hours! + +Investing in data platform tools and automation is a key part of the vision for +Platform Engineering which encompasses Data Engineering, Data Operations, and +Core Platform. We have a [number of open positions](/careers/#open-positions) +at the moment, but I wanted to call special attention to the [Data Engineering +Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156) +role for which we're currently hiring. The leader of the Data Engineering team +will help deliver data tools and solutions for internal customers building on +top of Delta Lake, Databricks, Airflow, and Kafka. Suffice it to say, there's a +lot of really interesting work to be done! diff --git a/_posts/2021-07-12-identifying-document-types.md b/_posts/2021-07-12-identifying-document-types.md new file mode 100644 index 0000000..b8f8fed --- /dev/null +++ b/_posts/2021-07-12-identifying-document-types.md @@ -0,0 +1,131 @@ +--- +layout: post +title: "Identifying Document Types at Scribd" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +author: jonathanr +--- + + +User-uploaded documents have been a core component of Scribd’s business from +the very beginning, understanding what is _actually_ in the document corpus +unlocks exciting new opportunities for discovery and recommendation. +With Scribd anybody can [upload and share +documents](https://www.scribd.com/docs), analogous to YouTube and videos. Over +the years, our document corpus has become larger and more diverse which has +made understanding it an ever-increasing challenge. +Over the past year one of the missions of the Applied Research team has been to +extract key document metadata to enrich +downstream discovery systems. Our approach combines semantic understanding with +user behaviour in a multi-component machine learning system. + +This is part 1 in a series of blog posts explaining the challenges and +solutions explored while building this system. This post presents the +limitations, challenges, and solutions encountered when developing a model to +classify arbitrary user-uploaded documents. + + +## Initial Constraints + +The document corpus at Scribd stretches far and wide in terms of content, language and structure. An arbitrary document can be anything from math homework to Philippine law to engineering schematics. In the first stage of the document understanding system, we want to exploit visual cues in the documents. Any model used here must be language-agnostic to apply to arbitrary documents. This is analogous to a “first glance” from humans, where we can quickly distinguish a comic book from a business report without having to read any text. To satisfy these requirements, we use a computer vision model to predict the document type. But what is a “type”? + + + +## Identifying Document Types + +A necessary question to ask, but a difficult one to answer –  what kind of documents do we have? As mentioned in the section above, we’re interested in differentiating documents based on visual cues, such as text-heavy versus spreadsheet versus comics. 
We're not yet interested in more granular information like fiction vs. non-fiction.

Our approach to this challenge was twofold. First, we talked to subject matter experts at Scribd about the kinds of documents they have seen in the corpus. This was and continues to be very informative, as they have domain-specific knowledge that we leverage with machine learning. The second solution was to use a data-driven method to explore documents. This consisted of creating embeddings for documents based on their usage. Clustering and plotting these embeddings on an interactive map allowed us to examine document structure in different clusters. Combining these two methods drove the definition of document types. Below is an example of one of these maps we used to explore the corpus.
*Figure 1: Map of the document corpus, built from user-interaction embeddings. More on this method in a future post.*
+ +We converged on 6 document types, which included sheet-music, text-heavy, comics and tables. More importantly, these 6 classes don’t account for every single document in our corpus. While there are many different ways of dealing with out-of-distribution examples in the literature, our approach explicitly added an “other” class to the model and train it. We talk more about its intuition, potential solutions to the problem and challenges faced in the coming sections. + + +## Document Classification + +As mentioned in the introduction, we need an approach that is language and content agnostic, meaning that the same model will be appropriate for all documents, whether they contain images, text, or a combination of both. To satisfy these constraints we use a computer vision model to classify individual pages. These predictions can then be combined with other meta-data such as page count or word count to form a prediction for the entire document. + + +### Gathering Labelled Pages and Documents + +Before the model training started, we faced an interesting data gathering problem. Our goal is to classify documents, so we must gather labelled documents. However, in order to train the page classifier mentioned above, we must also gather labelled pages. Naively, it might seem appropriate to gather labelled documents and use the document label for each of its pages. This isn't appropriate as a single document can contain multiple types of pages. As an example, consider the pages in this document. + + +
+ Three pages from the same document +
Figure 2: Three different pages from the same document to demonstrate why we can't take the document label and assign it to each page.
+
+ + +The first and third pages can be considered text-heavy, but definitely not the second. Taking all the pages of this document and labelling them as text-heavy would severely pollute our training and testing data. The same logic applies to each of our 6 classes. + +To circumvent this challenge, we took an active learning approach to data gathering. We started with a small set of hand-labelled pages for each class and trained binary classifiers iteratively. The binary classification problem is simpler than the multi-class problem, requiring less hand-labelled data to obtain reliable results. At each iteration, we evaluated the most confident and least confident predictions of the model to get a sense of its inductive biases. Judging from these, we supplemented the training data for the next iteration to tweak the inductive biases and have confidence in the resulting model and labels. The sheet music class is a prime example of tweaking inductive biases. Below is an example of a page that can cause a sheet music misclassification if the model learns that sheet music is any page with horizontal lines. Supplementing the training data at each iteration helps get rid of inductive biases like this. + + +
+ Example of possible sheet music misclassification from wrong inductive bias +
Figure 3: Example of possible sheet music misclassification due to wrong inductive biases.
+
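To make one iteration of that loop concrete, here is a minimal sketch in which a scikit-learn logistic regression stands in for whatever binary page classifier and page features are actually used; the embeddings and labels below are randomly generated purely for illustration:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def review_iteration(X_labeled, y_labeled, X_pool, n_review=20):
    """One pass of the labelling loop: fit a binary classifier (e.g. "is this
    page sheet music?"), then surface the pool pages it is most and least
    confident about so a human can inspect them and correct the labels."""
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_labeled, y_labeled)

    # Probability that each unlabelled page belongs to the positive class.
    probs = clf.predict_proba(X_pool)[:, 1]

    # Most confident positives: check for false positives such as lined
    # notebook pages being mistaken for sheet music.
    most_confident = np.argsort(probs)[-n_review:][::-1]
    # Least confident pages: the most informative ones to hand-label next.
    least_confident = np.argsort(np.abs(probs - 0.5))[:n_review]
    return clf, most_confident, least_confident

# Toy stand-ins: 512-dimensional page features with binary labels.
rng = np.random.default_rng(0)
X_labeled = rng.normal(size=(200, 512))
y_labeled = rng.integers(0, 2, size=200)
X_pool = rng.normal(size=(5000, 512))

clf, check_these, label_these = review_iteration(X_labeled, y_labeled, X_pool)
```

The reviewed pages are folded back into the labelled set and the loop repeats until the most and least confident predictions both look sensible.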
+ +After creating these binary classifiers for each class, we have a large set of reliable labels and classifiers that can be used to gather more data if necessary. + + +### Building a Page Classifier + +The page classification problem is very similar to ImageNet classification, so we can leverage pre-trained ImageNet models. We used transfer learning in [fast.ai](https://www.fast.ai/) and [PyTorch](https://pytorch.org/) to fine-tune pre-trained computer vision architectures for the page-classifier. After initial experiments, it was clear that models with very high ImageNet accuracy, such as EfficientNet, did not perform much better on our dataset. While it’s difficult to pinpoint exactly why this is the case, we believe it is because of the nature of the classification task, the page resolutions and our data. + +We found [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf), a relatively established lightweight architecture, to be the best balance between accuracy and inference time. Because models such as ResNets and DenseNets are so large, they take a lot of time to train and iterate on. However, SqueezeNet is an order of magnitude smaller than these models, which opens up more possibilities in our training scheme. Now we can train the entire model and are not limited to using the pre-trained architecture as a feature-extractor, which is the case for larger models. + + +
+ Figure 4: SqueezeNet architectures taken from the paper. Left: SqueezeNet; Middle: SqueezeNet with simple bypass; Right: SqueezeNet with complex bypass. +
Figure 4: SqueezeNet architectures taken from the paper. Left: SqueezeNet; Middle: SqueezeNet with simple bypass; Right: SqueezeNet with complex bypass.
+
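As a rough illustration of what that looks like, the sketch below uses plain PyTorch/torchvision to swap SqueezeNet's ImageNet head for document-type classes and fine-tune the whole network (our actual training ran through fast.ai on top of PyTorch); the class count, optimizer, and learning rate are placeholders, and it assumes a torchvision version new enough to expose the `weights` API:

```python
import torch
import torch.nn as nn
from torchvision import models

NUM_CLASSES = 7  # e.g. six document types plus the catch-all "other" class

# Start from ImageNet weights and keep every layer trainable -- SqueezeNet is
# small enough to fine-tune end to end instead of using it as a frozen
# feature extractor.
model = models.squeezenet1_1(weights=models.SqueezeNet1_1_Weights.IMAGENET1K_V1)

# SqueezeNet's classifier head is a 1x1 convolution over 512 channels; swap it
# for one that emits our document-type logits instead of 1000 ImageNet classes.
model.classifier[1] = nn.Conv2d(512, NUM_CLASSES, kernel_size=1)
model.num_classes = NUM_CLASSES

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

def train_step(pages: torch.Tensor, labels: torch.Tensor) -> float:
    """One optimisation step over a batch of rendered page images."""
    model.train()
    optimizer.zero_grad()
    loss = criterion(model(pages), labels)
    loss.backward()
    optimizer.step()
    return loss.item()

# Toy batch: 8 page images rendered at 224x224 in RGB.
loss = train_step(torch.randn(8, 3, 224, 224),
                  torch.randint(0, NUM_CLASSES, (8,)))
```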
+ + +Additionally, for this particular model, low inference time is key in order to run it on hundreds of millions of documents. Inference time is also directly tied to costs, so an optimal cost/benefit ratio would require significantly higher performance to justify higher processing time. + + +### Ensembled Pages for Document Classification + +We now have a model to classify document pages, and we need to turn those page-level predictions into a prediction for each document, ideally combining them with additional meta-data such as total page count, page dimensions, etc. However, our experiments here showed that a simple ensemble of the page classifications provided an extremely strong baseline that was difficult to beat with meta-data. + +To increase efficiency, we sample 4 pages from the document to ensemble. This way we don’t run into processing issues for documents with thousands of pages. This number was chosen based on the performance of the classifier and the page distribution in the document corpus, which empirically verified our assumption that this sample size reasonably represents each document. + + +### Error Analysis and Overconfidence + +After error analysis of a large sample of documents from production, we found that some classes were returning overconfident but wrong predictions. This is a very interesting challenge and one that has seen an explosion of academic research recently. To elaborate, we found documents that were misclassified with over 99% confidence scores. A major consequence of this is that it negates the effectiveness of setting a threshold on model output in order to increase precision. + +While there are different ways of dealing with this, our approach involved two steps. Firstly, we utilized the “other” class mentioned earlier. By adding many of these adversarial, out-of-distribution examples to the “other” class and re-training the model, we were able to quickly improve metrics without changing model architecture. Secondly, this affected some classes more than others. For these, individual binary classifiers were built to improve precision. + +### Where do we go from here? + +
+ Figure 5: Diagram of the overall document understanding system. The red box is what we talked about in this post +
Figure 5: Diagram of the overall document understanding system. The red box is what we talked about in this post
+
+ + +Now that we have a model to filter documents based on visual cues, we can build dedicated information extraction models for each document type – sheet music, text-heavy, comics, tables. This is exactly how we proceed from here, and we start with extracting information from text-heavy documents. + +[Part 2](/blog/2021/information-extraction-at-scribd.html) in this series will dive deeper into the challenges and solutions our +team encountered while building these models. If you're interested to learn more about the problems Applied Research is solving or the systems which are built around those solutions, check out [our open positions!](/careers/#open-positions) + + +## References + +- [SqueezeNet: AlexNet-Level Accuracy with 50X Fewer Parameters and <0.5MB Model Size](https://arxiv.org/pdf/1602.07360.pdf) diff --git a/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md new file mode 100644 index 0000000..d476d5e --- /dev/null +++ b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md @@ -0,0 +1,46 @@ +--- +layout: post +title: "Presenting Rust and Python Support for Delta Lake" +tags: +- deltalake +- databricks +- featured +- rust +author: rtyler +team: Core Platform +--- + +Delta Lake is integral to our data platform which is why we have invested +heavily in [delta-rs](https://github.com/delta-io/delta-rs) to support our +non-JVM Delta Lake needs. This year I had the opportunity to share the progress +of delta-rs at Data and AI Summit. Delta-rs was originally started by my colleague [QP](https://github.com/houqp) just over a year ago and it has now grown to now a multi-company project with numerous contributors, and downstream projects such as [kafka-delta-ingest](/blog/2021/kafka-delta-ingest.html). + + + +In the session embedded below, I introduce the delta-rs project which is +helping bring the power of Delta Lake outside of the Spark ecosystem. By +providing a foundational Delta Lake library in Rust, delta-rs can enable native +bindings in Python, Ruby, Golang, and more.We will review what functionality +delta-rs supports in its current Rust and Python APIs and the upcoming roadmap. + +I also try to give an overview of one of the first projects to use it in +production: +[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest), which +builds on delta-rs to provide a high throughput service to bring data from +Kafka into Delta Lake. + + +
+ +
+ + +Investing in data platform tools and automation is a key part of the vision for +Platform Engineering which encompasses Data Engineering, Data Operations, and +Core Platform. We have a [number of open positions](/careers/#open-positions) +at the moment including a position to work closely with me as [Data Engineering +Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156). +The leader of the Data Engineering team will help deliver data tools and +solutions for internal customers building on top of Delta Lake, Databricks, +Airflow, and Kafka. Suffice it to say, there's a lot of really interesting work +to be done! diff --git a/_posts/2021-07-21-information-extraction-at-scribd.md b/_posts/2021-07-21-information-extraction-at-scribd.md new file mode 100644 index 0000000..6708e45 --- /dev/null +++ b/_posts/2021-07-21-information-extraction-at-scribd.md @@ -0,0 +1,163 @@ +--- +layout: post +title: "Information Extraction at Scribd" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +authors: +- antoniam +- rafaelp +--- + +Extracting metadata from our documents is an important part of our discovery +and recommendation pipeline, but discerning useful and relevant details +from text-heavy user-uploaded documents can be challenging. This is +part 2 in a series of blog posts describing a multi-component machine learning +system the Applied Research team built to extract metadata from our documents in order to enrich downstream discovery models. In this post, we present the challenges and +limitations the team faced when building information extraction NLP models for Scribd's +text-heavy documents and how they were solved. + +As mentioned in [part 1](/blog/2021/identifying-document-types.html), we now have a way of identifying text-heavy documents. Having done that, we want to build dedicated models to deepen our semantic understanding of them. We do this by extracting keyphrases and entities. + +
+ Figure 1: Diagram of our multi-component machine learning system. +
Figure 1: Diagram of our multi-component machine learning system.
+
+ +Keyphrases are phrases that represent major themes/topics, whereas entities are proper nouns such as people, places and organizations. For example, when a user uploads a document about the Manhattan project, we will first detect it is text-heavy, then extract keyphrases and entities. Potential keyphrases would be “atomic bomb” and “nuclear weapons” and potential entities would be “Robert Oppenheimer” and “Los Alamos”. + +As keyphrase extraction brings out the general topics discussed in a document, it helps put a cap on the amount of information kept per document, resulting in a somewhat uniform representation of documents irrespective of their original size. Entity extraction, on the other hand, identifies elements in a text that aren't necessarily reflected by keyphrases only. We found the combination of keyphrase and entity extraction to provide a rich semantic description of each document. + +The rest of this post will explain how we approached keyphrase and entity extraction, and how we identified whether a subset of these keyphrases and entities are present in a knowledge base (also known as linking), and introduce how we use them to categorize documents. + +## Keyphrase Extraction + +Typically a keyphrase extraction system operates in two steps as indicated in this survey:  + +- Using heuristics to extract a list of words/phrases that serve as candidate keyphrases, such as part-of-speech language patterns, stopwords filtering, and n-grams with Wikipedia article titles + +- Determining which of these candidate keyphrases are most likely to be keyphrases, using one of the two approaches: + + - Supervised approaches such as binary classification of candidates (useful/not useful), structural features based on positional encoding, etc. + + - Unsupervised approaches such as selecting terms with the highest tf-idf and clustering. + +Training a decent supervised model to be able to extract keyphrases across a wide variety of topics would require a large amount of training data, and might generalize very poorly. For this reason, we decided to take the unsupervised approach. + +Our implementation of keyphrase extraction is optimized for speed without sacrificing keyphrase quality much. We employ both a statistical method and language specific rules to identify them efficiently. + +We simply start by filtering out stopwords and extracting the n-grams with a base n (bi-grams in our case, n=2). This step is fast and straightforward and results in an initial set of candidate n-grams.  + +Limiting the results to a single n-gram class, however, results in split keyphrases, which makes linking them to a knowledge base a challenging task. For that, we attempt to agglomerate lower order n-grams into potentially longer keyphrases, as long as they occur at a predetermined minimum frequency as compared to the shorter n-gram, based on the following a pattern:  + +`A sequence of nouns (NN) possibly interleaved with either Coordinating Conjunctions (CC) or Prepositions and Subordinating Conjunctions (IN).` + +Here are a few examples: + +- Assuming the minimum frequency of agglomeration is 0.5, that means we would only replace the bi-gram `world (NN) health (NN)` by `world (NN) health (NN) organization (NN)` as long as `world health organization` occurs at least 50% as much as `world health` occurs.  + +- Replace `Human (NNP) Development (NNP)` with `Center(NNP) for (IN) Global (NNP) Development (NNP)` only if the latter occurs at least a predetermined percentage of time as compared to the former. 
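Putting the pattern and the examples above together, here is a minimal sketch of the frequency-based agglomeration step. It assumes the candidate phrases have already been extracted and POS-filtered upstream, and the counts are made up:

```python
from collections import Counter

def agglomerate(counts: Counter, min_ratio: float = 0.5) -> Counter:
    """Merge shorter candidate phrases into longer ones that contain them,
    as long as the longer phrase occurs at least `min_ratio` times as often.
    `counts` maps candidate phrases (already POS-filtered) to frequencies."""
    merged = Counter(counts)
    for short, short_n in counts.items():
        # Longer candidates that extend the shorter phrase, e.g.
        # "world health" -> "world health organization".
        extensions = [p for p in counts if p != short and short in p]
        for longer in extensions:
            if counts[longer] >= min_ratio * short_n:
                # Fold the shorter phrase's mass into the longer keyphrase.
                merged[longer] += merged.pop(short, 0)
                break
    return merged

counts = Counter({
    "world health": 10,
    "world health organization": 6,   # 60% of "world health" -> merged
    "atomic bomb": 8,
})
print(agglomerate(counts).most_common())
# [('world health organization', 16), ('atomic bomb', 8)]
```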
+ +This method results in more coherent and complete keyphrases that could be linked more accurately to a knowledge base entry. + +Finally, we use the count of occurrences of the candidate keyphrase as a proxy for its importance. This method is reliable for longer documents, as the repetition of a keyphrase tends to reliably indicate its centrality to the document’s topic.  + +## Named Entities + +Keyphrases are only one side of finding what’s important in a document. To further capture what a document is about, we must also consider the named entities that are present. + +Named Entity Extraction systems identify instances of named entities in a text, which we can count in order to represent their importance in the document, similar to how we did with keyphrases. + +Naively counting named entities through exact string matches surfaces an interesting problem: a single entity may go by many names or aliases, which means string frequency is an unreliable measurement of importance. In the example given in Figure 2, we know that “Mill”, “John Stuart Mill” and “Stuart Mill” all refer to the same person. This means that Mill is even more central to the document than the table indicates, since he is referred to a total of 8 times instead of 5. + + +
+ Figure 2: Excerpt from John Stuart Mill’s Wikipedia page (left) and Top 5 Named Entity counts of the first few paragraphs (right). +
Figure 2: Excerpt from John Stuart Mill’s Wikipedia page (left) and Top 5 Named Entity counts of the first few paragraphs (right).
+
+ +To address this counting problem, let's introduce a few abstractions: + +- `Named Entity` refers to a unique person, place or organization. Because of their uniqueness, we can represent them with a unique identifier (ID).  + +- `Named Entity Alias` (or simply Alias), is one of possibly many names associated with a particular entity. + +- `Canonical Alias` is the preferred name for an entity. + +- `Named Entity Mention` (or simply `Mention`), refers to each occurrence in a text that a Named Entity was referred to, regardless of which Alias was used. + +- `Knowledge Base` is a collection of entities, allowing us to query for ID, canonical name, aliases and other information that might be relevant for the task at hand. One example is [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page). + +The first step to solve the counting problem is to normalize the names a document uses to refer to a named entity. Using our abstractions, this means we want to find all the mentions in a document, and use its alias to find the named entity it belongs to. Then, replace it with either the canonical name or the named entity ID - this distinction will become clearer later on. + +### Entity Normalization + +Given a set of aliases that appear in a document, we developed heuristics (e.g. common tokens, initials) to identify which subset of aliases refer to the same named entity. This allowed us to limit our search space when comparing aliases. + +Using our previous example to illustrate this method, we start by assuming the canonical alias is the longest alias in a text for a given entity, and attempt to merge aliases together by evaluating which aliases match the heuristics we developed.  + +
+ Table 1: Top 5 occurring aliases in the first few paragraphs of John Stuart Mill’s Wikipedia page, some referring to the same person.
+ +
Table 1: Top 5 occurring aliases in the first few paragraphs of John Stuart Mill’s Wikipedia page, some referring to the same person. +
+
+ +Comparing entities with each other using exact token matching as a heuristic would solve this: + +
+ Table 2: Pairwise alias comparisons and resulting merges. Matches highlighted in bold. +
Table 2: Pairwise alias comparisons and resulting merges. Matches highlighted in bold. +
+
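Here is a small sketch of that normalization using only the shared-token heuristic (the real heuristics also cover initials and other edge cases); the aliases and mention counts are made up to mirror the example above:

```python
from collections import Counter

def normalize_aliases(mention_counts: Counter) -> Counter:
    """Group aliases that share a token, take the longest alias in each group
    as the canonical one, and sum the mention counts."""
    # Process longer aliases first so they become the canonical names.
    aliases = sorted(mention_counts, key=len, reverse=True)
    canonical: dict[str, str] = {}
    for alias in aliases:
        tokens = set(alias.lower().split())
        match = next(
            (c for c in canonical.values() if tokens & set(c.lower().split())),
            None,
        )
        canonical[alias] = match if match else alias

    normalized = Counter()
    for alias, count in mention_counts.items():
        normalized[canonical[alias]] += count
    return normalized

mentions = Counter({"John Stuart Mill": 3, "Mill": 4, "Stuart Mill": 1, "Jeremy Bentham": 2})
print(normalize_aliases(mentions))
# Counter({'John Stuart Mill': 8, 'Jeremy Bentham': 2})
```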
+ +By replacing all mentions with its corresponding canonical alias, we are able to find the correct named entity counts. + +One edge case is when an alias might refer to more than one entity: e.g. the alias “Potter” could refer to the named entities “Harry Potter” or “James Potter” within the Harry Potter universe. To solve this, we built an Entity Linker, which determines which named entity is the most likely to match the alias given the context. This process is further explained in the Linking to a Knowledge Base section. + +When an entity is not present in a knowledge base, we cannot use Named Entity Linking to disambiguate. In this case, our solution uses a fallback method that assigns the ambiguous mention (Potter) to the closest occurring unambiguous mention that matches the heuristics (e.g. Harry).  + +## Linking to a Knowledge Base + +Given that many keyphrases and entities mentioned in a document are notable, they are likely present in a knowledge base. This allows us to leverage extra information present in the knowledge base to improve the normalization step as well as downstream tasks. + +Entity Linking assists normalization by providing information that an alias matches a named entity, which otherwise wouldn't match a heuristic (e.g. “Honest Abe” versus “Abraham Lincoln”). Furthermore, [information in a knowledge base can be used to embed linked entities and keyphrases in the same space as text](https://arxiv.org/abs/1601.01343). + +Being able to embed entities in the same space as text is useful, as this unlocks the ability to [compare possible matching named entity IDs with the context in which they’re mentioned](https://arxiv.org/abs/1911.03814), and make a decision on whether an alias we’re considering might be one of the entities in the knowledge base (in which case we will use IDs), or whether the alias doesn't match any entity in the knowledge base, in which case we fall back to using the assumed canonical alias.  + +At Scribd we make use of Entity Linking to not only improve the Entity Normalization step, but also to take advantage of entity and keyphrase embeddings as supplemental features. + +## Discussion + +Putting all of this together, we can: + +1. Link documents to keyphrases and entities + +1. Find the relative importance of each in a document + +1. Take advantage of relevant information in knowledge bases + +This has enabled some interesting projects: + +In one of them, the Applied Research team built a graph of documents along with their related keyphrases and entities. Embedding documents, keyphrases and entities in the same space allowed us to discover documents by analogy. For example, take `The Count of Monte Cristo` by Alexandre Dumas, a 19th century French novel about revenge. If we add to its embedding the embedding of `science_fiction`, it leads us to a collection of science fiction novels by Jules Verne (another 19th century French author), such as `20,000 Leagues Under the Sea` and `Journey to the Center of the Earth`. + +Keyphrase extractions have also been useful in adding clarity to document clusters. By extracting the most common keyphrases of a cluster, we can derive a common theme for the cluster’s content: + + +
+ Figure 3: Top keyphrases in a document cluster. The keywords imply that the documents therein are related to dentistry & healthcare, which was confirmed by manually inspecting the documents. +
Figure 3: Top keyphrases in a document cluster. The keywords imply that the documents therein are related to dentistry & healthcare, which was confirmed by manually inspecting the documents.
+
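To make the analogy example above concrete: the mechanics are just vector addition followed by a nearest-neighbour lookup in the shared embedding space. The vectors below are random stand-ins for the real document and keyphrase embeddings:

```python
import numpy as np

# Toy shared embedding space; in practice these vectors come from the
# graph and embedding models described above.
rng = np.random.default_rng(7)
vectors = {name: rng.normal(size=64) for name in [
    "the_count_of_monte_cristo",
    "science_fiction",
    "20000_leagues_under_the_sea",
    "journey_to_the_center_of_the_earth",
    "pride_and_prejudice",
]}

def most_similar(query: np.ndarray, k: int = 3) -> list[str]:
    """Rank items by cosine similarity to the query vector."""
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    return sorted(vectors, key=lambda name: cos(vectors[name], query), reverse=True)[:k]

# "Document + keyphrase" analogy: start from a novel, move in the direction
# of the science_fiction keyphrase, and look at what lands nearby.
query = vectors["the_count_of_monte_cristo"] + vectors["science_fiction"]
print(most_similar(query))
```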
+ +In yet another project, the team leveraged precomputed knowledge base embeddings to represent a document in space through a composition of the entities and keyphrases it contains. These features allowed us to understand the documents uploaded by our users and improve the content discovery on the platform. + +To see how we use the information extracted to classify documents into a +taxonomy, make sure to check out [part 3](/blog/2021/categorizing-user-uploaded-documents.html). + +If you're interested to learn more about the problems Applied Research +is solving, or the systems which are built around those solutions, +check out [our open positions!](/careers/#open-positions) + diff --git a/_posts/2021-07-28-categorizing-user-uploaded-documents.md b/_posts/2021-07-28-categorizing-user-uploaded-documents.md new file mode 100644 index 0000000..ddd2349 --- /dev/null +++ b/_posts/2021-07-28-categorizing-user-uploaded-documents.md @@ -0,0 +1,140 @@ +--- +layout: post +title: "Categorizing user-uploaded documents" +tags: +- machinelearning +- data +- featured +- kyc-series +team: Applied Research +author: moniquec +--- + +Scribd offers a variety of publisher and user-uploaded content to our users and +while the publisher content is rich in metadata, user-uploaded content +typically is not. Documents uploaded by the users have varied subjects and +content types which can make it challenging to link them together. One way to +connect content can be through a taxonomy - an important type of structured +information widely used in various domains. In this series, we have already +shared how we [identify document +types](/blog/2021/identifying-document-types.html) and [extract information +from documents](/blog/2021/information-extraction-at-scribd.html), this post +will discuss how insights from data were used to help build the taxonomy and +our approach to assign categories to the user-uploaded documents. + + +## Building the taxonomy + +The unified taxonomy is a tree-structure with two layers that was designed by combining our Subject Matter Experts' (SME) knowledge of the book industry subject headings ([BISAC](https://bisg.org/page/BISACEdition) categories) and data-driven insights. We used user-reading patterns to find topics that could help enrich our unified taxonomy. + +### Data-Driven Insights + +Users have been interacting with Scribd content for more than 10 years, building reading patterns throughout time. We leveraged these reading patterns to create dense vector representations of documents similarly to word2vec in text. + +
+ Schematic representation of our approach: reading sequences are used to create vector representations for user uploaded documents. The vector dimension shown is merely illustrative. +
Figure 1: Schematic representation of our approach: reading sequences are used to create vector representations for user uploaded documents. The vector dimension shown is merely illustrative.
+
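A minimal sketch of this idea using gensim's word2vec implementation (gensim 4.x API), where each user's reading sequence plays the role of a sentence; the sequences and hyperparameters below are placeholders rather than our production settings:

```python
from gensim.models import Word2Vec

# Each "sentence" is one user's reading sequence of document ids, ordered in
# time -- the analogue of a sentence of words in word2vec.
reading_sequences = [
    ["doc_101", "doc_842", "doc_57"],
    ["doc_842", "doc_57", "doc_990", "doc_101"],
    ["doc_990", "doc_57", "doc_842"],
]

# Skip-gram word2vec over those sequences. The vector size, window and other
# hyperparameters are the knobs tuned against hit-ratio@20.
model = Word2Vec(
    sentences=reading_sequences,
    vector_size=64,
    window=5,
    min_count=1,   # real runs would use a higher threshold
    sg=1,
    epochs=10,
    seed=42,
)

doc_vector = model.wv["doc_842"]                 # dense representation of one document
neighbours = model.wv.most_similar("doc_842")    # documents read in similar contexts
```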
+ +For this work we focused only on user uploaded documents and on one type of interaction (reading for a minimum amount of time). The embedding dimensions (and other hyperparameters) were chosen to optimize the hit-ratio@20 ([Caselles-Dupré, et al 2018](https://arxiv.org/abs/1804.04212)), increasing how semantically tight the embeddings are. + +Now that we have the embeddings we would like to use them to find groups of documents with similar subjects and topics. Finding these groups will help us identify categories that should be added to the taxonomy. + +Dimensionality reduction allows for dense clusters of documents to be found more efficiently and accurately in the reduced space in comparison to the original high-dimensional space of our embeddings. We reduced the dimension of the embeddings using the [t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) algorithm. t-SNE has a non-linear approach that can capture the smaller relationships between the points, as well as the global structure of the data. We used an implementation of t-SNE (“Fast Fourier Transform accelerated Interpolation-based t-SNE” - [FIt-SNE](https://github.com/KlugerLab/FIt-SNE)) that is flexible and does not sacrifice accuracy for speed. + +Finally, we grouped the user-uploaded docs by clustering the reduced embeddings using [HDBSCAN](https://arxiv.org/pdf/1709.04545.pdf). HDBSCAN separates data points into clusters based on the density distribution. It also has a feature to detect noise, which are points that are too far from the nearest detected cluster to belong to it, and lack the density to form their own cluster. + +Figure 2 shows the 2D representation of the user-uploaded documents and their groups. The first thing we noticed, highlighted in this figure, is that the major groups usually correspond to language. Not surprisingly, users tend to read content mostly in a single language. +
+ Figure 2: Initial 2D representation of the embeddings using t-SNE and HDBSCAN. Each colored group represents a cluster found by HDBSCAN. Spread grey points were identified as noise. +
Figure 2: Initial 2D representation of the embeddings using t-SNE and HDBSCAN. Each colored group represents a cluster found by HDBSCAN. Spread grey points were identified as noise.
+
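A simplified sketch of this reduce-then-cluster step is shown below; scikit-learn's t-SNE stands in for FIt-SNE, the `hdbscan` package provides the clustering, and the embeddings and parameters are illustrative:

```python
import numpy as np
import hdbscan
from sklearn.manifold import TSNE

# Stand-in for the usage-based document embeddings (documents x dimensions).
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(2000, 64))

# Project to 2D. scikit-learn's t-SNE stands in for FIt-SNE here; the real
# pipeline swaps in the FFT-accelerated implementation for speed.
coords = TSNE(n_components=2, perplexity=30, init="pca", random_state=0).fit_transform(embeddings)

# Density-based clustering on the reduced space. Points HDBSCAN cannot place
# in any dense region get the label -1, i.e. the grey "noise" points in Figure 2.
labels = hdbscan.HDBSCAN(min_cluster_size=25).fit_predict(coords)

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
noise_fraction = float(np.mean(labels == -1))
print(f"{n_clusters} clusters, {noise_fraction:.1%} noise")
```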
+ +We developed a technique to further split the groups above into smaller clusters that are semantically tighter. The final clusters can be seen in Figure 3. +
+ Figure 3: Final 2D representation of the embeddings after further splitting of each cluster. Each colored group represents a subcluster found by HDBSCAN for a particular cluster. Spread grey points were identified as noise. +
Figure 3: Final 2D representation of the embeddings after further splitting of each cluster. Each colored group represents a subcluster found by HDBSCAN for a particular cluster. Spread grey points were identified as noise.
+
+ +After we got the clusters and subclusters shown in Figure 3, we inspected the English subclusters in order to identify their major subjects and themes. This investigation led to the incorporation of additional categories into the taxonomy, such as Philippine law, Study aids & test prep, and Teaching methods & materials, making the taxonomy broader across different content types and browsing this content more straightforward. + +## Placing documents into categories + +
+ Figure 4: Diagram of Scribd’s multi-component pipeline. Categorization is one of the downstream tasks highlighted in the diagram. +
Figure 4: Diagram of Scribd’s multi-component pipeline. Categorization is one of the downstream tasks highlighted in the diagram.
+
+ +Now that we have the taxonomy, it is time to place the documents into categories. Our approach leverages the extracted key phrases and entities discussed in [part II](/blog/2021/information-extraction-at-scribd.html) of the series. Figure 5 illustrates how our model works: we trained a supervised model to place documents identified as text-heavy (see [part I](/blog/2021/identifying-document-types.html)) into categories using key phrases, entities and the text. + +
+ Figure 5: Model architecture to categorize docs. +
Figure 5: Model architecture to categorize docs.
+
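Figure 5 shows the model we actually use; as a deliberately simplified stand-in, the sketch below illustrates how key phrases, entities and text can be folded into a single supervised classifier using TF-IDF features and logistic regression. The documents, labels and categories are invented for illustration:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Each training example folds the document text together with its extracted
# key phrases and entities into a single string feature.
def to_features(text: str, keyphrases: list[str], entities: list[str]) -> str:
    return " ".join([text] + keyphrases + entities)

train_docs = [
    to_features("ruling on land reform statutes", ["philippine law"], ["Supreme Court"]),
    to_features("practice exam with answer key", ["test prep"], ["SAT"]),
    to_features("itinerary and packing list", ["travel"], ["South America"]),
]
train_labels = ["Philippine law", "Study aids & test prep", "Travel"]

model = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), min_df=1),
    LogisticRegression(max_iter=1000),
)
model.fit(train_docs, train_labels)

print(model.predict([to_features("guided tour of coastal towns", ["travel"], ["Portugal"])]))
```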
+ +### Additional insights from data + +In the first iteration of the model, we had a dataset for training collected by our experts to fit the definition of each category. Not surprisingly, upon testing the model on unseen data in production, we realized that for some categories the training set was not a complete representation of the type of documents in production that could fit them. For this reason, the model was unable to generalize with the initial given training set. As an example, in the initial training set most documents about countries other than the US were documents about travel. This means that the model learned that whenever a document mentions other countries, the document is most likely about travel. For this reason, documents about business in South America, for instance, would be placed under travel by the model. + +We applied a technique sometimes referred to as active learning to supplement our training set with the missing examples. Following this technique (Figure 6), the model is applied to a random sample of documents and the results analyzed by our SMEs. + +
+ Figure 6: Active Learning Process used to improve model performance. +
Figure 6: Active Learning Process used to improve model performance.
+
+ +This iterative process had two outcomes: it improved category performance by re-training the model on a larger variety of training examples, and it led to the addition of a new category after we identified that a good fraction of documents fit it. + +## Additional Experiments + +Throughout this project several experiments were performed to explore the full potential of the user interaction clusters. Here we will show one exciting example of such an experiment. + +#### Giving names to clusters + +As explained above, in general, each subcluster shown in Figure 3 is semantically tight, which means that the documents belonging to a subcluster are usually about one (or a few) topics or subjects. + +One way to associate topics with the subclusters would be to have Subject Matter Experts manually inspect the documents in each subcluster and come up with the most important topics for each of them. However, this approach is time consuming, and thus not scalable across new iterations of the model and a likely increasing number of clusters. It is very important to make this a more automatic and flexible process. + +We experimented with a very promising two-step approach to automatically assign topics to subclusters. In this approach, we leverage the extracted information from the text described in [part II](/blog/2021/information-extraction-at-scribd.html) and zero-shot topic classification (more info [here](https://arxiv.org/abs/1909.00161)): + +Step 1 - Find the subclusters' most representative key phrases by clustering their documents' extracted info. + +
+ Figure 7: Illustration of Step 1. +
Figure 7: Illustration of Step 1.
+
+ +Step 2 - Use the result of step 1 and zero-shot topic classification to find the highest ranking topics for each subcluster. + +
+ Figure 8: Illustration of Step 2. The bar plot with the highest ranking topics is the result of this approach for a subcluster that contains essays about several literary works. +
Figure 8: Illustration of Step 2. The bar plot with the highest ranking topics is the result of this approach for a subcluster that contains essays about several literary works.
+
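A minimal sketch of Step 2, using the off-the-shelf NLI-based zero-shot pipeline from Hugging Face (the same family of approach as the paper linked above); the model choice, keyphrases and candidate topics are illustrative rather than the exact ones we use:

```python
from transformers import pipeline

# NLI-based zero-shot classifier.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Step 1 output for one subcluster: its most representative key phrases.
cluster_keyphrases = "hamlet, macbeth, tragic hero, symbolism, character analysis"

# Step 2: rank candidate topics for the subcluster.
candidate_topics = ["literary criticism", "sheet music", "travel", "mathematics"]
result = classifier(cluster_keyphrases, candidate_topics, multi_label=True)

for topic, score in zip(result["labels"], result["scores"]):
    print(f"{topic}: {score:.2f}")
```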
+ +As it can be seen in figure 8, a cluster composed of literary works' essays has as the highest ranking topic literary criticism showing the potential of this approach for automatically giving names to user interaction clusters. + +## Conclusion + +Two important takeaways from this journey of categorizing documents were: + +**High quality labeled data** - We found that clean and consistently labelled data was much more important to the model than hyperparameter tuning. However, getting enough documents that fit the categories in our diverse corpus was a challenge. Several techniques were used to improve model performance on unseen data. Among them, active learning proved to be an important way to collect additional training samples and to guarantee the required granularity in the training set. + +**Annotation alignment** - High quality data and model performance are both connected to the annotation process (see more [here](https://www.youtube.com/watch?v=06-AZXmwHjo)). When multiple annotators are involved in the data collection and evaluation, alignment on the definition of each category is crucial for an accurate training and evaluation of the model. This is even more essential in text classification, since associating categories/topics to a text can be a very subjective task, specially when we are dealing with a single-label categorization problem. + +This project was an important milestone in understanding our user-uploaded documents: Classifying documents has enabled users to browse documents by category from our unified taxonomy. Additionally, we now have the power of understanding the categories that each user is interested in and interacts with. Combining the user interests with business metrics could help drive innovative and unexpected product decisions as well as enrich discoverability and recommendations. + +## Next Steps + +**Improve taxonomy using a data driven approach:** + +Moving forward, how can we make sure that newly uploaded documents are covered in our taxonomy? + +Using a data driven approach to build the taxonomy answers these questions and guarantees more flexibility, comprehensiveness, and specificity as opposed to a manually created taxonomy. As new content is uploaded to our platform and read by users, new user interaction clusters will form and help us identify recent user interests. For instance, during the pandemic, users started uploading documents related to Covid-19. Clustering the documents in 2021 for example, yields an additional cluster related to Covid-19, one that did not exist prior to the pandemic. This approach will help us build a less rigid taxonomy, a taxonomy that reflects Scribd’s vast content and is easily expandable in the long run. + +**Multi-language:** + +Now that we understand more our user-uploaded content in English and that we have a consistent pipeline to give labels to these documents, we can extend this approach to other languages + +This work and post were done in collaboration with my colleague [Antonia Mouawad](https://ca.linkedin.com/in/antoniamouawad) on the Applied Research team. If you're interested to learn more about the problems Applied Research is solving, or the systems which are built around those solutions, check out [our open positions](/careers/#open-positions). 
diff --git a/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md new file mode 100644 index 0000000..5f17cad --- /dev/null +++ b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md @@ -0,0 +1,92 @@ +--- +layout: post +title: "Armadillo makes audio players in Android easy" +tags: +- android +- kotlin +- armadillo +- featured +author: nathans +team: Android +--- + +Armadillo is the fully featured audio player library Scribd uses to play and +download all of its audiobooks and podcasts, which is [now open +source](https://github.com/scribd/armadillo). It specializes in playing HLS +or MP3 content that is broken down into chapters or tracks. It leverages +[Google’s Exoplayer](https://github.com/google/ExoPlayer/) library for its audio engine. Exoplayer wraps a variety of +low level audio and video apis but has few opinions of its own for actually +using audio in an Android app. + +![Armadillo Image](https://raw.githubusercontent.com/scribd/armadillo/main/armadillo.webp) + +The leap required from Exoplayer to audio player +is enormous both in terms of the amount of code needed as well as the amount of +domain knowledge required about complex audio related subjects. Armadillo +provides a turn-key solution for powering an audio player and providing the +information to update a UI. + +- **Easy-to-use** because it outputs state updates with everything needed for a UI or analytics. Works in the background state. +- **Effective** because it uses Google’s Exoplayer as the playback engine. +- **Ready-to-go** out of the box usage for a developer looking to use an audio player. +- **Robust** because it contains numerous configuration options for supporting most any requirement and includes a number of other android apis +required for a high quality audio player. + +## What does it include? + +- Support for HLS and MP3 audio +- Exoplayer for downloading and playback +- [MediaBrowserService](https://developer.android.com/reference/android/service/media/MediaBrowserService) so the app can be played in the background, browsed by other apps, and integrated with Android Auto. +- [MediaSession](https://developer.android.com/reference/android/media/session/MediaSession) to support commands from media controllers, ex. a bluetooth headset. + +## Getting Started: + +The library is hosted with Github packages so you will need to add the Github registry with authentication to your build.gradle file. See the official docs on authenticating [here](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-gradle-registry#authenticating-to-github-packages). But you will need to: + +1. Generate a [personal access token](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) from your Github account. +1. Add the Github package registry with authentication to your `build.gradle` file. + +```kotlin +maven { + name = "GitHubPackages" + url = uri("https://maven.pkg.github.com/scribd/armadillo-and") + credentials { + username = "github_username" + password = "github_access_token" + } +} +``` + +It is as easy as adding this code snippet to your Activity / Fragment to play your first piece of content. 
+ +```kotlin +// construct your media +val media = AudioPlayable( + id = 0, + title = "Google Hosted Mp3", + request = AudioPlayable.MediaRequest.createHttpUri("https://storage.googleapis.com/exoplayer-test-media-0/play.mp3"), + chapters = emptyList() +) + +// initialize the player +val armadilloPlayer = ArmadilloPlayerFactory.init() + +// begin playback +armadilloPlayer.beginPlayback(media) + +// listen for state updates +armadilloPlayer.armadilloStateObservable.subscribe { + + // update your UI here + +} +``` + +That’s all you need to get started! + +## Next Steps: + +For a more complex example, please see the [TestApp](https://github.com/scribd/armadillo/tree/main/TestApp) included in the library. If +you have any problems, don’t be afraid to open up an issue [on +GitHub](https://github.com/scribd/armadillo). + diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md new file mode 100644 index 0000000..8916901 --- /dev/null +++ b/_posts/2022-04-28-data-ai-summit-2022.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Scribd is presenting at Data and AI Summit 2022" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We are very excited to be presenting and attending this year's [Data and AI +Summit](https://databricks.com/dataaisummit/north-america-2022) which will be +hosted virtually and physically in San Francisco from June 27th-30th. +Throughout the course of 2021 we completed a number of really interesting +projects built around [delta-rs](https://github.com/delta-io/delta-rs) and the +Databricks platform which we are thrilled to share with a broader audience. +In addition to the presentations listed below, a number of Scribd engineers who +are responsible for data and ML platform, machine learning systems, and more, +will be in attendance if you want to meet up and learn more about how Scribd +uses data and ML to change the way the world reads! + + +* [Christian Williams](https://github.com/xianwill) will be sharing some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +**[Streaming Data into Delta Lake with Rust and +Kafka](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1834)** +* [QP Hou](https://github.com/houqp), Scribd Emeritus, will be presenting on +his foundational work to ensure correctness within delta-rs during his session: +**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1623)** +* [R Tyler Croy](https://github.com/rtyler) will be co-presenting with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with **[Doubling the size of the data lake without doubling the +cost](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=2366)** + + +There are so many great sessions to watch in person or online during the event, +particularly around [Delta Lake](https://delta.io), which is one of our +favorite technologies and powers our entire data platform. We are also +expecting some great ML related talks as data and ML begin to overlap more and +more. We hope to see you there! 
+ diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md new file mode 100644 index 0000000..1007238 --- /dev/null +++ b/_posts/2022-06-28-databricks-serverless.md @@ -0,0 +1,58 @@ +--- +layout: post +title: "Accelerating Looker with Databricks SQL Serverless" +tags: +- looker +- databricks +- featured +team: Core Platform +author: hamiltonh +--- + +We recently migrated Looker to a Databricks SQL Serverless, improving our +infrastructure cost and reducing the footprint of infrastructure we need to +worry about! “Databricks SQL” which provides a single load balanced Warehouse +for executing Spark SQL queries across multiple Spark clusters behind the +scenes. “Serverless” is an evolution of that concept, rather than running a SQL +Warehouse in our AWS infrastructure, the entirety of execution happens on the +Databricks side. With a much simpler and faster interface, queries executed in +Looker now return results much faster to our users than ever before! + +When we originally provisioned our “Databricks SQL” warehouses, we worked +together with our colleagues at Databricks to ensure [the terraform provider +for Databricks](https://github.com/databricks/terraform-provider-databricks) is +ready for production usage, which as of today is Generally Available. That +original foundation in Terraform allowed us to more easily adopt SQL Serverless +once it was made available to us. + +```hcl +resource "databricks_sql_warehouse" "warehouse" { + name = "Looker Serverless" + # ... + enable_serverless_compute = true + # ... +} +``` + +The feature was literally brand new so there were a few integration hurdles we +had to work through with our colleagues at Databricks, but we got things up and +running in short order. By adopting SQL Serverless, we could avoid setting up +special networking, IAM roles, and other resources within our own AWS account, +we can instead rely on pre-provisioned compute resources within Databricks' own +infrastructure. No more headache of ensuring all of the required infra is in +place and setup correctly! + +The switch to Serverless reduced our infra configuration and management +footprint, which by itself is an improvement. We also noticed a significant +reduction in cold start times for the SQL Serverless Warehouse compared to the +standard SQL Warehouse. The faster start-up times meant we could configure even +lower auto-terminate times on the warehouse, savings us even more on +unproductive and idle cluster costs. + +On the Looker side there really wasn’t any difference in the connection +configuration other than a URL change. In the end, after some preparation work +a simple 5 minute change in Looker, and a simple 5 minute change in Terraform +switched everything over to Databricks SQL Serverless, and we were ready to +rock! Our BI team is very happy with the performance, especially on cold start +queries. Our CFO is happy about reducing infrastructure costs. And I’m happy +about simpler infrastructure! diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md new file mode 100644 index 0000000..828f149 --- /dev/null +++ b/_posts/2022-07-21-data-ai-summit-videos.md @@ -0,0 +1,45 @@ +--- +layout: post +title: "Data and AI Summit Wrap-up" +team: Core Platform +author: rtyler +tags: +- databricks +- kafka +- deltalake +- featured +--- + +We brought a whole team to San Francisco to present and attend this year's Data and +AI Summit, and it was a blast! 
+I +would consider the event a success both in the attendance to the Scribd hosted +talks and the number of talks which discussed patterns we have adopted in our +own data and ML platform. +The three talks I [wrote about +previously](/blog/2022/data-ai-summit-2022.html) were well received and have +since been posted to YouTube along with _hundreds_ of other talks. + +* [Christian Williams](https://github.com/xianwill) shared some of the +work he has done developing +[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk: +[![Streaming Data into Delta Lake with Rust and Kafka](https://img.youtube.com/vi/do4jsxeKfd4/hqdefault.jpg)](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195) +* [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on +his foundational work to ensure correctness within delta-rs during his session: +[![Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal +Verification](https://img.youtube.com/vi/ABoCnrVWCKY/hqdefault.jpg)](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112) +* [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin +Edgley from Databricks on the cost analysis work Scribd has done to efficiently +grow our data platform with: +[![Doubling the size of the data lake without doubling the cost](https://img.youtube.com/vi/9QDRD0PzqCE/hqdefault.jpg)](https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122) + +Members of the Scribd team participated in a panel to discuss the past, +present, and future of Delta Lake on the expo floor. We also took advantage of +the time to have multiple discussions with our colleagues at Databricks about +their product and engineering roadmap, and where we can work together to +improve the future of Delta Lake, Unity catalog, and more. + +For those working in the data, ML, or infrastructure space, there are a lot of +_great_ talks available online from the event, which I highly recommend +checking out. Data and AI Summit is a great event for leaders in the industry +to get together, so we'll definitely be back next year! diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md new file mode 100644 index 0000000..37f22c2 --- /dev/null +++ b/_posts/2024-02-05-evolution-of-mlplatform.md @@ -0,0 +1,133 @@ +--- +layout: post +title: "The Evolution of the Machine Learning Platform" +team: Machine Learning Platform +author: bshaw +tags: +- mlops +- featured +- ml-platform-series +--- + +Machine Learning Platforms (ML Platforms) have the potential to be a key component in achieving production ML at scale without large technical debt, yet ML Platforms are not often understood. This document outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms in an effort to increase an understanding of these platforms and how they can best be applied. + + +Technical Debt and development velocity defined +----------------------------------------------- + +### Development Velocity + +Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment in a production environment. 
It encompasses the entire lifecycle of a machine learning project, from data collection and preprocessing to model training, evaluation, validation deployment and testing for new models or for re-training, validation and deployment of existing models. + +### Technical Debt + +The term "technical debt" in software engineering was coined by Ward Cunningham, Cunningham used the metaphor of financial debt to describe the trade-off between implementing a quick and dirty solution to meet immediate needs (similar to taking on financial debt for short-term gain) versus taking the time to do it properly with a more sustainable and maintainable solution (akin to avoiding financial debt but requiring more upfront investment). Just as financial debt accumulates interest over time, technical debt can accumulate and make future development more difficult and expensive. + +The idea behind technical debt is to highlight the consequences of prioritizing short-term gains over long-term maintainability and the need to address and pay off this "debt" through proper refactoring and improvements. The term has since become widely adopted in the software development community to describe the accrued cost of deferred work on a software project. + +### Technical Debt in Machine Learning + +Originally a software engineering concept, Technical debt is also relevant to Machine Learning Systems infact the landmark google paper suggest that ML systems have the propensity to easily gain this technical debt. + +> Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. Using the software engineering framework of technical debt , we find it is common to incur massive ongoing maintenance costs in real-world ML systems +> +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +> As the machine learning (ML) community continues to accumulate years of experience with livesystems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML sys-tems is relatively fast and cheap, but maintaining them over time is difficult and expensive +> +> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +Technical debt is important to consider especially when trying to move fast. Moving fast is easy, moving fast without acquiring technical debt is alot more complicated. + +The Evolution Of ML Platforms +----------------------------- + +### DevOps -- The paradigm shift that led the way + +DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. Dev ops has become standard practice in modern software development and the adoption of DevOps has been widespread, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are: + +1. **Automation** + +2. **Continuous Testing** + +3. **Continuous Monitoring** + +4. **Collaboration and Communication** + +5. **Version Control** + +6. 
**Feedback Loops** + + +### Platforms -- Reducing Cognitive Load + +This shift to DevOps and teams teams owning the entire development lifecycle introduces a new challenge—additional cognitive load. Cognitive load can be defined as + +> The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +The weight of the additional load introduced in DevOps of teams owning the entire software development lifecycle can hinder productivity, prompting organizations to seek solutions. + +Platforms emerged as a strategic solution, delicately abstracting unnecessary details of the development lifecycle. This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow. + +> The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services. +> +> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) + +> Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users +> +> [Rowse & Shepherd (2022) Building Infrastructure Platforms](https://martinfowler.com/articles/building-infrastructure-platform.html) + +### ML Ops -- Reducing technical debt of machine learning + +The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps. MLOps is a methodology that takes inspiration from and incorporates best practices of the DevOps, tailoring them to address the distinctive challenges inherent in machine learning. MLOps applies the established principles of DevOps to machine learning, recognizing that merely a fraction of real-world ML systems comprises the actual ML code. Serving as a crucial bridge between development and the ongoing intricacies of maintaining ML systems. +MLOps is a methodology that provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability of the ML Lifecycle. Correctly applied MLOps can play a pivotal role controlling technical debt and ensuring the efficiency, reliability, and scalability of the machine learning lifecycle over time. + +Scribd's ML Platform -- MLOps and Platforms in Action +------------------------------------- +At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers. This platform has been built with MLOps in mind which can be seen through its use of common DevOps principles. + +1. **Automation:** + * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS based endpoints. + * Automating Model training throug the use of Airflow DAGS and allowing these DAGS to trigger the deployment pipelines to deploy a model once re-training has occured. + +2. **Continuous** **Testing:** + * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing. 
+ * Increased tooling to support model validation testing. + +3. **Monitoring:** + * Monitoring real time inference endpoints + * Monitoring training DAGS + * Monitoring batch jobs + +4. **Collaboration and Communication:** + * Feature Store which provides feature discovery and re-use + * Model Database which provides model collaboration + +6. **Version Control:** + * Applying version control to experiments, machine learning models and features + + +References +---------- + +[Bottcher (2018, March 05). What I Talk About When I Talk About Platforms. https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html) + +[D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-Franc¸ois Crespo, Dan Dennison (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems) + +[Fowler (2022, October 20).Conway's Law. https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html) + +[Galante, what is platform engineering. https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering) + +[Humanitect, State of Platform Engineering Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report) + +[Hodgson (2023, July 19).How platform teams get stuff done. https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html) + +[Murray (2017, April 27. The Art of Platform Thinking. https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking)](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking) + +[Rouse (2017, March 20). Technical Debt. https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt) + +[Rowse & Shepherd (2022).Building Infrastructure Platforms. https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html) + +[Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book) diff --git a/_posts/2025-01-15-cloud-native-data-ingestion.md b/_posts/2025-01-15-cloud-native-data-ingestion.md new file mode 100644 index 0000000..2df1b88 --- /dev/null +++ b/_posts/2025-01-15-cloud-native-data-ingestion.md @@ -0,0 +1,35 @@ +--- +layout: post +title: "Cloud-native Data Ingestion with AWS Aurora and Delta Lake" +team: "Core Infrastructure" +author: rtyler +tags: +- deltalake +- rust +- featured +--- + + +One of the major themes for Infrastructure Engineering over the past couple +years has been higher reliability and better operational efficiency. In a +recent session with the [Delta Lake](https://delta.io) project I was able to +share the work led Kuntal Basu and a number of other people to _dramatically_ +improve the efficiency and reliability of our online data ingestion pipeline. + + +> Join Kuntal Basu, Staff Infrastructure Engineer, and R. Tyler Croy, Principal +> Engineer at Scribd, Inc. as they take you behind the scenes of Scribd’s data +> ingestion setup. They’ll break down the architecture, explain the tools, and +> walk you through how they turned off-the-shelf solutions into a robust +> pipeline. + + +## Video + +
+ + +## Presentation + +
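If you would like a hands-on feel for the Delta Lake side of this work, the sketch below shows the smallest version of the operation an ingestion job performs over and over: appending a batch of rows to a Delta table. This is an illustrative toy under stated assumptions, not the pipeline described in the session; it assumes the open-source `deltalake` Python package and pandas are installed, and the table path and column names are invented for the example.

```python
# Minimal, illustrative sketch only -- not Scribd's production ingestion code.
# Assumes `pip install deltalake pandas`; the table path and columns are placeholders.
import pandas as pd
from deltalake import DeltaTable, write_deltalake

table_uri = "./tmp/demo_table"  # in a real deployment this would be an s3:// location

# The first write creates the Delta table.
write_deltalake(table_uri, pd.DataFrame({"id": [1, 2], "title": ["a", "b"]}))

# Later batches append, which is roughly what a continuous ingestion job does
# for every micro-batch of incoming change data.
write_deltalake(table_uri, pd.DataFrame({"id": [3], "title": ["c"]}), mode="append")

# Read the table back to confirm both batches landed.
print(DeltaTable(table_uri).to_pandas())
```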
+ diff --git a/_posts/2025-03-14-terraform-oxbow-module.md b/_posts/2025-03-14-terraform-oxbow-module.md new file mode 100644 index 0000000..ab48af2 --- /dev/null +++ b/_posts/2025-03-14-terraform-oxbow-module.md @@ -0,0 +1,61 @@ +--- +layout: post +title: "Terraform module to manage Oxbow Lambda and its components" +tags: +- Oxbow +- Terraform +- AWS +- deltalake +- rust +team: Core Infrastructure +author: Oleh Motrunych +--- + + +[Oxbow](https://github.com/buoyant-data/oxbow) is a project that turns an existing storage location containing [Apache Parquet](https://parquet.apache.org/) files into a [Delta Lake table](https://delta.io/). +It is intended to run either as an AWS Lambda function or as a command-line application. +We are excited to introduce [terraform-oxbow](https://github.com/scribd/terraform-oxbow), an open-source Terraform module that simplifies the deployment and management of the Oxbow AWS Lambda function and its supporting components. Whether you're working with AWS Glue, Kinesis Data Firehose, SQS, or DynamoDB, this module provides a streamlined approach to infrastructure as code (IaC) in AWS. + +### ✨ Why terraform-oxbow? +Managing event-driven architectures in AWS can be complex, requiring careful orchestration of multiple services. The terraform-oxbow module abstracts much of this complexity, enabling users to configure key components with simple boolean flags and module parameters. This makes it easy and efficient to create a Delta table from Apache Parquet files. +### 🚀 Features + +With **terraform-oxbow**, you can deploy: + +- AWS Oxbow Lambda with customizable configurations +- Kinesis Data Firehose for real-time data streaming +- SQS and SQS Dead Letter Queues for event-driven messaging +- IAM policies for secure access management +- S3 bucket notifications to trigger Lambda functions +- DynamoDB tables for data storage and locking +- AWS Glue Catalog and Tables for schema management + + +### ⚙️ How It Works + +This module follows a modular approach, allowing users to enable or disable services based on their specific use case. Here are a few examples: + +- To enable AWS Glue Catalog and Tables: ```hcl +enable_aws_glue_catalog_table = true +``` + +- To enable the Kinesis Data Firehose delivery stream: ```hcl +enable_kinesis_firehose_delivery_stream = true +``` + +- To enable S3 bucket notifications: ```hcl +enable_bucket_notification = true +``` + +- To enable the advanced Oxbow Lambda setup for multi-table filtered optimization: ```hcl +enable_group_events = true +``` + +- AWS S3 bucket notifications have limitations: Due to AWS constraints, an S3 bucket can only have a single notification configuration per account. If you need to trigger multiple Lambda functions from the same S3 bucket, consider using event-driven solutions like SNS or SQS. + + +- IAM Policy Management: The module provides the necessary permissions but follows the principle of least privilege. Ensure your IAM policies align with your security requirements.
+ + +- Scalability and Optimization: The module allows fine-grained control over Lambda concurrency, event filtering, and data processing configurations to optimize costs and performance + diff --git a/assets/_sass/component/_post-content.scss b/assets/_sass/component/_post-content.scss index ff4a7bf..49466f3 100644 --- a/assets/_sass/component/_post-content.scss +++ b/assets/_sass/component/_post-content.scss @@ -4,17 +4,13 @@ // Bump of base font size for a comfortable reading experience .post-content { - font-size: 1rem; + // actually 17.875px + font-size: rem-calc(22px); // make sure really long words and links wrap word-wrap: break-word; overflow-wrap: break-word; - @media (min-width: $bp-sm) { - font-size: rem-calc(18px); - } - @media (min-width: $bp-lg) { - font-size: rem-calc(20px); line-height: 1.6; } } diff --git a/assets/js/jobs.js b/assets/js/jobs.js index 829719b..8172cb9 100644 --- a/assets/js/jobs.js +++ b/assets/js/jobs.js @@ -5,7 +5,7 @@ * * With that disclaimer out of the way... * - * This file handles the fetching of jobs from Lever such that they can be + * This file handles the fetching of jobs from Lever^WAshby such that they can be * dynamically inserted into different parts of the tech blog */ @@ -13,7 +13,7 @@ * This API will return an list of departments which must then be filtered * through to find the .postings under each */ -const API_URL = 'https://api.lever.co/v0/postings/scribd?group=department&mode=json' +const API_URL = 'https://api.ashbyhq.com/posting-api/job-board/scribd?includeCompensation=true' /* @@ -37,21 +37,20 @@ function fetchJobs() { return fetch(API_URL) .then(async (response) => { - const departments = await response.json(); + const board = await response.json(); /* * Since this is the tech blog, we're only pulling a couple of * departments */ - departments - .filter(d => ['Engineering', 'Data Science', 'Design'].includes(d.title)) - .forEach((department) => { - department.postings.forEach((posting) => { - const team = posting.categories.team; + board.jobs + .filter(j => ['Engineering', 'Product, Design, & Analytics', 'Product'].includes(j.department)) + .filter(j => !j.title.toLowerCase().includes('marketing')) + .forEach((job) => { + const team = job.team; if (!window.jobsCache[team]) { window.jobsCache[team] = []; } - window.jobsCache[team].push(posting); - }); + window.jobsCache[team].push(job); }); window.jobsFetched = true; return window.jobsCache; @@ -98,9 +97,9 @@ function renderJobs(elem, team, randomLimit) { li.innerHTML = `
- ${job.text} + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

`; elem.appendChild(li); diff --git a/careers.html b/careers.html index e70f715..5a5072a 100644 --- a/careers.html +++ b/careers.html @@ -18,7 +18,7 @@ alt="two people sitting around a sofa reading on a computer and tablet">
-

Help us change the way the world reads.

+

Help us build the largest and most accessible library connecting storytellers with their audience.

Our readers are on a mission to become their best selves, and so are we. We’re not afraid to take risks because we know that — win or lose — we’ll learn from them.

If you’re a talented team player and want to work somewhere where your input matters, we’d love to talk with you.

@@ -60,7 +60,7 @@

We believe that the secret to making the perfect product is
-

Benifits

+

Benefits

Our team takes great care of us, in return, we take great care of them.

@@ -159,14 +159,14 @@

${team} li.innerHTML = `
- - ${job.text} + + ${job.title}
-

${job.categories.location || ''}

+

${job.location || ''}

- `; diff --git a/post-images/2020-12-sidekiq-incident-learnings/overall-sidekiq-dashboard.png b/post-images/2020-12-sidekiq-incident-learnings/overall-sidekiq-dashboard.png new file mode 100644 index 0000000..65d939e Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/overall-sidekiq-dashboard.png differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/overall-worker-metrics.png b/post-images/2020-12-sidekiq-incident-learnings/overall-worker-metrics.png new file mode 100644 index 0000000..782722e Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/overall-worker-metrics.png differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/sidekiq-disabled-workers.jpeg b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-disabled-workers.jpeg new file mode 100644 index 0000000..3ee368c Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-disabled-workers.jpeg differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/sidekiq-dropped-workers.jpeg b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-dropped-workers.jpeg new file mode 100644 index 0000000..df797e3 Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-dropped-workers.jpeg differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/sidekiq-incident-handling.png b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-incident-handling.png new file mode 100644 index 0000000..8509899 Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-incident-handling.png differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/sidekiq-web-ui.jpg b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-web-ui.jpg new file mode 100644 index 0000000..bf9bff9 Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/sidekiq-web-ui.jpg differ diff --git a/post-images/2020-12-sidekiq-incident-learnings/single-worker-dashboard.png b/post-images/2020-12-sidekiq-incident-learnings/single-worker-dashboard.png new file mode 100644 index 0000000..48cc9da Binary files /dev/null and b/post-images/2020-12-sidekiq-incident-learnings/single-worker-dashboard.png differ diff --git a/post-images/2021-03-github-datadog/all-velocity-metrics.png b/post-images/2021-03-github-datadog/all-velocity-metrics.png new file mode 100644 index 0000000..40442d4 Binary files /dev/null and b/post-images/2021-03-github-datadog/all-velocity-metrics.png differ diff --git a/post-images/2021-03-github-datadog/bad-workflow.png b/post-images/2021-03-github-datadog/bad-workflow.png new file mode 100644 index 0000000..05ed578 Binary files /dev/null and b/post-images/2021-03-github-datadog/bad-workflow.png differ diff --git a/post-images/2021-03-github-datadog/good-workflow.png b/post-images/2021-03-github-datadog/good-workflow.png new file mode 100644 index 0000000..585a887 Binary files /dev/null and b/post-images/2021-03-github-datadog/good-workflow.png differ diff --git a/post-images/2021-03-github-datadog/job-comparison.png b/post-images/2021-03-github-datadog/job-comparison.png new file mode 100644 index 0000000..e64c3ba Binary files /dev/null and b/post-images/2021-03-github-datadog/job-comparison.png differ diff --git a/post-images/2021-03-github-datadog/team-time-to-merge.png b/post-images/2021-03-github-datadog/team-time-to-merge.png new file mode 100644 index 0000000..e303961 Binary files /dev/null and b/post-images/2021-03-github-datadog/team-time-to-merge.png differ diff --git 
a/post-images/2021-03-sql-delta-import/missing_rows.png b/post-images/2021-03-sql-delta-import/missing_rows.png new file mode 100644 index 0000000..17ea648 Binary files /dev/null and b/post-images/2021-03-sql-delta-import/missing_rows.png differ diff --git a/post-images/2021-03-sql-delta-import/row_density_increase.png b/post-images/2021-03-sql-delta-import/row_density_increase.png new file mode 100644 index 0000000..5aa5123 Binary files /dev/null and b/post-images/2021-03-sql-delta-import/row_density_increase.png differ diff --git a/post-images/2021-04-ebr-scribd/f1.png b/post-images/2021-04-ebr-scribd/f1.png new file mode 100644 index 0000000..89e5fd0 Binary files /dev/null and b/post-images/2021-04-ebr-scribd/f1.png differ diff --git a/post-images/2021-04-ebr-scribd/f2.png b/post-images/2021-04-ebr-scribd/f2.png new file mode 100644 index 0000000..251ac7e Binary files /dev/null and b/post-images/2021-04-ebr-scribd/f2.png differ diff --git a/post-images/2021-04-ebr-scribd/f3.png b/post-images/2021-04-ebr-scribd/f3.png new file mode 100644 index 0000000..2aedf01 Binary files /dev/null and b/post-images/2021-04-ebr-scribd/f3.png differ diff --git a/post-images/2021-04-okta-airflow/airflow-login.png b/post-images/2021-04-okta-airflow/airflow-login.png new file mode 100644 index 0000000..8b59904 Binary files /dev/null and b/post-images/2021-04-okta-airflow/airflow-login.png differ diff --git a/post-images/2021-04-okta-airflow/okta-tiles.png b/post-images/2021-04-okta-airflow/okta-tiles.png new file mode 100644 index 0000000..25ab217 Binary files /dev/null and b/post-images/2021-04-okta-airflow/okta-tiles.png differ diff --git a/post-images/2021-04-okta-airflow/sample-okta-setup.png b/post-images/2021-04-okta-airflow/sample-okta-setup.png new file mode 100644 index 0000000..09dcc67 Binary files /dev/null and b/post-images/2021-04-okta-airflow/sample-okta-setup.png differ diff --git a/post-images/2021-05-kafka-delta-ingest/kafka-delta-ingest-deployment.png b/post-images/2021-05-kafka-delta-ingest/kafka-delta-ingest-deployment.png new file mode 100644 index 0000000..eac68ff Binary files /dev/null and b/post-images/2021-05-kafka-delta-ingest/kafka-delta-ingest-deployment.png differ diff --git a/tag/architecture/index.md b/tag/architecture/index.md new file mode 100644 index 0000000..ed856de --- /dev/null +++ b/tag/architecture/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: architecture" +tag: architecture +robots: noindex +--- diff --git a/tag/codegen/index.md b/tag/codegen/index.md new file mode 100644 index 0000000..c7d1a10 --- /dev/null +++ b/tag/codegen/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: codegen" +tag: codegen +robots: noindex +--- diff --git a/tag/databricks/index.md b/tag/databricks/index.md new file mode 100644 index 0000000..ac13519 --- /dev/null +++ b/tag/databricks/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: databricks" +tag: databricks +robots: noindex +--- diff --git a/tag/datadog/index.md b/tag/datadog/index.md new file mode 100644 index 0000000..b255b51 --- /dev/null +++ b/tag/datadog/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: datadog" +tag: datadog +robots: noindex +--- diff --git a/tag/deltalake/index.md b/tag/deltalake/index.md new file mode 100644 index 0000000..3f0a3a9 --- /dev/null +++ b/tag/deltalake/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: deltalake" +tag: deltalake +robots: noindex +--- diff --git a/tag/deploys/index.md b/tag/deploys/index.md new file mode 100644 index 
0000000..3a395f1 --- /dev/null +++ b/tag/deploys/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: deploys" +tag: deploys +robots: noindex +--- diff --git a/tag/eks/index.md b/tag/eks/index.md new file mode 100644 index 0000000..22df77f --- /dev/null +++ b/tag/eks/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: eks" +tag: eks +robots: noindex +--- diff --git a/tag/hotdog/index.md b/tag/hotdog/index.md new file mode 100644 index 0000000..b7c5cf3 --- /dev/null +++ b/tag/hotdog/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: hotdog" +tag: hotdog +robots: noindex +--- diff --git a/tag/kubernetes/index.md b/tag/kubernetes/index.md new file mode 100644 index 0000000..7c0f142 --- /dev/null +++ b/tag/kubernetes/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: kubernetes" +tag: kubernetes +robots: noindex +--- diff --git a/tag/lambda/index.md b/tag/lambda/index.md new file mode 100644 index 0000000..1c1a6e1 --- /dev/null +++ b/tag/lambda/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: lambda" +tag: lambda +robots: noindex +--- diff --git a/tag/mlops/index.md b/tag/mlops/index.md new file mode 100644 index 0000000..b51bead --- /dev/null +++ b/tag/mlops/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: mlops" +tag: mlops +robots: noindex +--- diff --git a/tag/real-time/index.md b/tag/real-time/index.md new file mode 100644 index 0000000..ce81e7e --- /dev/null +++ b/tag/real-time/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: real-time" +tag: real-time +robots: noindex +--- diff --git a/tag/rust/index.md b/tag/rust/index.md new file mode 100644 index 0000000..1354ad4 --- /dev/null +++ b/tag/rust/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: rust" +tag: rust +robots: noindex +--- diff --git a/tag/sidekiq/index.md b/tag/sidekiq/index.md new file mode 100644 index 0000000..e662140 --- /dev/null +++ b/tag/sidekiq/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: sidekiq" +tag: sidekiq +robots: noindex +--- diff --git a/tag/step function/index.md b/tag/step function/index.md new file mode 100644 index 0000000..a283fc5 --- /dev/null +++ b/tag/step function/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: step function" +tag: step function +robots: noindex +--- diff --git a/tag/syslog/index.md b/tag/syslog/index.md new file mode 100644 index 0000000..1cc51e6 --- /dev/null +++ b/tag/syslog/index.md @@ -0,0 +1,6 @@ +--- +layout: tag_page +title: "Tag: syslog" +tag: syslog +robots: noindex +---