We're on a mission to change the way the world reads. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.
+
We're on a mission to build the largest and most accessible library connecting storytellers with their audience. That's an ambitious task, and we need ambitious people to get us there. See what positions are available and start your next chapter today.
+
+So... story points got banned from the management meeting. Project managers
+could then only speak to deadline dates, adding language like 'best estimate'
+and 'could slip' to hedge around the fact that software development isn't an
+exact science. We still talked in story points in our team meetings, but it was
+a verboten term outside that space, which led to its own tensions.
+
+One of the other reasons story points got banned was the nature of the
+imprecision. _Yes_, points should be specific to the team, but our velocity was
+completely unpredictable from team to team. Sometimes it would be 20 points,
+other sprints 50. Regression passes were sometimes included and pointed,
+sometimes they weren't. Our investigation started with a team of more junior
+developers and QA that had one of the aforementioned velocity issues. We asked
+them how big a 3/5/8 was and got a different answer from each of them. We had
+found the underlying problem that had given story points a bad name!
+
+It was time to go back to basics. We held a "story pointing workshop" with that
+team. The team already had strong communication and a safe space, the
+retrospective, in which to talk through why their understanding was all over
+the map. Some of it was because they were more junior and less likely to know
+where the problems were in the code base, but some of it was because we had
+just assumed that everyone knew what a 5 might entail. An hour later we had a
+whiteboard covered in notes, with items under each number in the Fibonacci
+sequence contributed by each of the developers, QA, and the squad lead (in this
+case a senior developer). We ran the exercise again with the entire mobile QA
+team, doing the brainstorm fresh first and only then sharing where the first
+team had seen points falling. It turned into a wiki page that was shared within
+the project management team, and it spread from there.
+
+
+
+We were clear throughout this process that points were still team specific and
+that none of this was to be taken as hard-and-fast rules, but it gave teams a
+place to start the conversation and some common ground. We don't want to make
+it sound like all the teams were terrible at making story point estimates, or
+incapable of keeping a reliable velocity, but the variability often exceeded
+10%. That larger variance made it hard for management to trust our date
+estimates. With some common guidelines, it became easier for teams to estimate
+accurately, which led to more trust in deadlines, which ultimately gave us a
+way to talk about story points again.
+
+When the deadlines shifted to being reliable within a day or two, the
+conversation wasn't as charged, since somebody could show the detailed
+breakdown of the work, which was also reflected in Jira. We operate a project
+lifecycle that
+starts with a product brief, goes through design iterations, and then goes into
+story breakdown and sizing. Only after those steps are done do we "put hands on
+keyboard" and start writing software, usually with pretty solid estimates. **It
+turns out people really do need time to think through the problem before
+solving it!**
+
+Our approach isn't perfect, of course. We still have spots of tech debt and
+brittle code, and we will always have people who under- or over-estimate work,
+but that's exactly why we use story points.
+
+The initial reaction against story points was justifiable, but we kept
+iterating on the problems we ran into with the original implementation.
+Finally, by bringing up the concept of "velocity" and demonstrating how teams
+were getting more reliable in their estimates, we were able to show management
+that the method was worth trusting.
+
diff --git a/_posts/2020-04-27-managing-datadog-aws-with-terraform.md b/_posts/2020-04-27-managing-datadog-aws-with-terraform.md
new file mode 100644
index 0000000..e451abb
--- /dev/null
+++ b/_posts/2020-04-27-managing-datadog-aws-with-terraform.md
@@ -0,0 +1,111 @@
+---
+layout: post
+title: "Using Terraform to integrate Datadog and AWS"
+authors:
+- jimp
+- qphou
+tags:
+- featured
+- terraform
+- monitoring
+team: Core Infrastructure
+---
+
+We love metrics but hate manual processes. When we adopted
+[Datadog](https://datadoghq.com)'s built-in AWS
+[integration](https://docs.datadoghq.com/integrations/amazon_web_services/?tab=allpermissions)
+we couldn't wait to get AWS CloudWatch metrics into Datadog, but first we needed to automate
+the [numerous manual steps
+required](https://docs.datadoghq.com/integrations/amazon_web_services/?tab=allpermissions)
+to set it up. Datadog's AWS integration is quite powerful: once
+enabled, it automatically synchronizes specified CloudWatch metrics into a
+Datadog account. Basically, anything available within CloudWatch can be easily
+made available in Datadog, alongside all of our other metrics and dashboards.
+
+
+Despite the integration's power and convenience, its setup process is actually
+quite involved. As outlined in [Datadog's
+documentation](https://docs.datadoghq.com/integrations/amazon_web_services/?tab=allpermissions),
+there are *18 manual steps* required, including:
+
+- finding the right AWS account ID
+- creating the right IAM policy
+- copy-pasting the right AWS resource ID into the Datadog UI
+
+If you have more than a few AWS accounts like we do, you may prefer to automate this! In our case, that means using [Terraform](https://terraform.io).
+
+In this blog post, we would like to share how Scribd uses Terraform to automate
+our Datadog and AWS integration across the organization.
+
+# Enable Datadog’s built-in AWS integration
+
+To address this problem, we built the [terraform-aws-datadog
+module](https://github.com/scribd/terraform-aws-datadog). With only a couple of
+lines of HCL code, Terraform will perform all the necessary steps to set up the
+Datadog integration for a specific AWS account, following Scribd’s best practices:
+
+```terraform
+module "datadog" {
+ source = "git::https://github.com/scribd/terraform-aws-datadog.git?ref=master"
+ aws_account_id = data.aws_caller_identity.current.account_id
+ datadog_api_key = var.datadog_api_key
+ env = "prod"
+ namespace = "team_foo"
+}
+```
+
+The benefit from an AWS account maintainer's point of view is that using the
+module is a convenient way to inherit centralized best practices. For module
+maintainers, any change to the Datadog integration module can be released using
+a [standard Terraform module release process](https://www.terraform.io/docs/registry/modules/publish.html).
+
+
+# CloudWatch log synchronization
+
+Initially, the module only set up the base integration. As adoption increased, more
+features were added to the module by various teams. One of these features is
+automation for setting up CloudWatch log ingestion.
+
+Like setting up the official AWS integration app, the [instructions for log
+synchronization](https://docs.datadoghq.com/integrations/amazon_web_services/?tab=allpermissions#log-collection)
+are a bit overwhelming.
+
+However, using the `terraform-aws-datadog` module, we can enable the feature with a single parameter:
+
+```terraform
+module "datadog" {
+ source = "git::https://github.com/scribd/terraform-aws-datadog.git?ref=master"
+ datadog_api_key = var.datadog_api_key
+ env = "prod"
+ namespace = "project_foo"
+ cloudwatch_log_groups = ["cloudwatch_log_group_1", "cloudwatch_log_group_2"]
+}
+```
+
+That’s it! Terraform will automatically create the Datadog serverless function
+and triggers for the specified log groups to forward all CloudWatch logs into
+Datadog. After running `terraform apply`, you should be able to see logs showing
+up in Datadog within minutes.
+
+
+# Future work
+
+With both metrics and logs synchronized into Datadog, we are able to leverage
+Datadog as the central hub for all things monitoring. We are planning to bring
+more features to the module as we migrate Scribd’s infrastructure into AWS.
+
+Metrics ingested through the official AWS integration are delayed by a couple of
+minutes, which is not ideal when using them as signals for monitoring critical
+systems. There are opportunities to enable real-time metric synchronization by
+automating Datadog agent setup.
+
+The [datadog-serverless-functions
+repo](https://github.com/DataDog/datadog-serverless-functions/tree/master/aws)
+contains two other Lambda-based AWS augmentations that we may add as available
+features of the module: `vpc_flow_log_monitoring` and `rds_enhanced_monitoring`.
+
+Stay apprised of future releases by watching our [release page](https://github.com/scribd/terraform-aws-datadog/releases).
+
+_Special shout out to Taylor McClure and Hamilton Hord for starting the project, as well
+as Sai Kiran Burle, Kamran Farhadi and Eugene Pimenov for improvements and bug
+fixes._
diff --git a/_posts/2020-04-28-elasticache-slowlog-metrics-for-datadog.md b/_posts/2020-04-28-elasticache-slowlog-metrics-for-datadog.md
new file mode 100644
index 0000000..c563eec
--- /dev/null
+++ b/_posts/2020-04-28-elasticache-slowlog-metrics-for-datadog.md
@@ -0,0 +1,96 @@
+---
+layout: post
+title: "Sending ElastiCache slowlog metrics to Datadog"
+authors:
+- jimp
+tags:
+- terraform
+- elasticache
+- aws
+- monitoring
+team: Core Infrastructure
+---
+
+All managed services have trade-offs. When Scribd adopted AWS ElastiCache we
+could no longer use Datadog's excellent [Redis
+integration](https://docs.datadoghq.com/integrations/redisdb/)
+and lost some killer metrics we couldn't live without.
+We deployed the [AWS ElastiCache
+integration](https://docs.datadoghq.com/integrations/amazon_elasticache/#overview)
+for Datadog, which brought the desired metrics back to our dashboards,
+with one notable exception: "slowlog" metrics.
+
+The Redis [`SLOWLOG`](https://redis.io/commands/slowlog) is used to help identify queries
+which are taking too long to execute. We use the slowlog metrics provided by the
+Datadog Redis integration to alert us when a Redis server's behavior starts to go
+south, a key indicator of looming, user-impacting production issues.
+
+Since AWS ElastiCache is a managed service, we obviously cannot deploy a
+Datadog agent onto AWS' servers to run the Datadog Redis integration. The
+approach we have taken, which we have now open sourced, is to use AWS Lambda to
+periodically query our ElastiCache Redis instances and submit the missing
+slowlog metrics _directly_ to Datadog, just as the Redis integration would have
+done.
+
+## The Lambda job
+
+The first part of the equation is our Lambda job:
+[elasticache-slowlog-to-datadog](https://github.com/scribd/elasticache-slowlog-to-datadog),
+which connects to an AWS ElastiCache host (determined by the `REDIS_HOST` parameter),
+gathers its slowlogs, and submits a
+[`HISTOGRAM`](https://docs.datadoghq.com/developers/metrics/types/?tab=histogram)
+metric to Datadog, essentially mirroring the functionality of the Datadog Redis integration.
+
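+Conceptually, the job does something like the following Python sketch. This is
+a rough illustration only; the real project's implementation, metric names, and
+tags differ:
+
+```python
+# Hypothetical sketch of the "poll SLOWLOG, ship it to Datadog" approach.
+import os
+import time
+
+import redis                        # redis-py
+from datadog import initialize, api
+
+initialize(api_key=os.environ["DATADOG_API_KEY"])
+
+
+def handler(event, context):
+    client = redis.Redis(host=os.environ["REDIS_HOST"], port=6379)
+    now = time.time()
+    # SLOWLOG GET returns recent slow commands, durations in microseconds
+    durations = [entry["duration"] for entry in client.slowlog_get(128)]
+    if durations:
+        # Emulate histogram-style metrics with a few summary series
+        api.Metric.send([
+            {"metric": "elasticache.slowlog.micros.avg",
+             "points": [(now, sum(durations) / len(durations))]},
+            {"metric": "elasticache.slowlog.micros.max",
+             "points": [(now, max(durations))]},
+            {"metric": "elasticache.slowlog.micros.count",
+             "points": [(now, len(durations))]},
+        ])
+```
+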
+The application is packaged with its required libraries as a ready-to-deploy
+archive in our [releases
+page](https://github.com/scribd/elasticache-slowlog-to-datadog/releases). To
+deploy directly to AWS from the console, upload the “Full zip distribution” and
+supply the [required
+parameters](https://github.com/scribd/elasticache-slowlog-to-datadog#parameters).
+I’d recommend using our Terraform module, however.
+
+## The Terraform Module
+
+The second part of the equation is the Terraform module:
+[terraform-elasticache-slowlog-to-datadog](https://github.com/scribd/terraform-elasticache-slowlog-to-datadog),
+which applies the elasticache-slowlog-to-datadog Lambda job to target AWS accounts
+and ElastiCache instances.
+
+When Lambda jobs include libraries that must be vendored in, as
+`elasticache-slowlog-to-datadog` does, the existing patterns include [building
+locally, or uploading artifacts to
+S3](https://www.terraform.io/docs/providers/aws/r/lambda_function.html#specifying-the-deployment-package).
+However, I like the approach of maintaining a separate repository and build
+pipeline, as this works around Terraform’s [intentionally limited build
+functionality](https://github.com/hashicorp/terraform/issues/8344#issuecomment-361014199).
+The Terraform module consumes the
+[elasticache-slowlog-to-datadog
+artifact](https://github.com/scribd/terraform-elasticache-slowlog-to-datadog/blob/master/main.tf#L97).
+
+## Usage
+
+To deploy elasticache-slowlog-to-datadog via Terraform, add the following to your terraform file:
+
+```terraform
+module "slowlog_check" {
+ source = "git::https://github.com/scribd/terraform-elasticache-slowlog-to-datadog.git?ref=master"
+ elasticache_endpoint = "master.replicationgroup.abcdef.use2.cache.amazonaws.com"
+ elasticache_security_groups = ["sg-12345"]
+ subnet_ids = [ "subnet-0123456789abcdef", "subnet-abcdef1234567890", "subnet-1234567890abcdef", ]
+ vpc_id = "vpc-0123456789abcdef"
+ datadog_api_key = "abc123"
+ datadog_app_key = "abc123"
+ namespace = "example"
+ env = "dev"
+ tags = {"foo" = "bar"}
+}
+```
+
+## Conclusion
+
+Using AWS Lambda, we can supplement the metrics we get natively from Datadog’s AWS ElastiCache integration.
+
+Stay apprised of future developments by watching our release pages:
+
+- [elasticache-slowlog-to-datadog](https://github.com/scribd/elasticache-slowlog-to-datadog/releases)
+- [terraform-elasticache-slowlog-to-datadog](https://github.com/scribd/terraform-elasticache-slowlog-to-datadog/releases)
diff --git a/_posts/2020-04-29-monitoring-aws-with-panther.md b/_posts/2020-04-29-monitoring-aws-with-panther.md
new file mode 100644
index 0000000..da5ceec
--- /dev/null
+++ b/_posts/2020-04-29-monitoring-aws-with-panther.md
@@ -0,0 +1,68 @@
+---
+layout: post
+title: "Using Panther to monitor AWS infrastructure"
+tags:
+- monitoring
+- aws
+- featured
+- archived
+team: Security Engineering
+author: paha
+---
+
+***NOTE***: *Scribd’s security infrastructure has since evolved away from using Panther*
+
+Before widespread cloud usage, it was uncommon for one person to be present for the entire datacenter development lifecycle. Very few people knew how to design and build a datacenter from scratch while ensuring appropriate security configuration settings were set, on top of rigging up monitoring. It was even more uncommon for non-sysadmins to have any involvement in data center infrastructure construction or ongoing refinement. The cloud is very different. It only takes seconds to create an entire infrastructure from a template. And even developers are doing it!
+
+The monitoring challenges for such a scenario are significant. There aren't necessarily "more" monitoring data points, but the speed with which infrastructure can be created tends to result in infrastructure getting way out over its skis with respect to monitoring. Furthermore, since many barriers to entry for doing stupid things have been lowered to the point of non-existence, monitoring is the last great hope of maintaining control over a cloud environment. While access controls can still provide some guardrails, the flexibility that all engineers need to do their jobs requires that they have the ability to do "dangerous" things that they've never had to do before. The true definition of "full stack" has expanded.
+
+# We're moving!
+
+Scribd is in the midst of migrating our entire infrastructure from a legacy data center to AWS. Things are moving fast. We've given developer teams nearly complete access to their own AWS accounts. We've labeled these accounts "development environments" and haven't created any cross-connections between them and production system accounts, but developers still have a lot of power, much more than they had in the legacy environment.
+
+The AWS cloud has a few important saving graces to manage the new chaos, for which there isn't really an analogue in traditional data centers: universal event logging in a standard format for all resources, and highly granular permissions settings in a consistent format. Universal event logging in legacy centers was usually an asymptote that mortal sysadmins and security engineers could never actually reach. This was due to the inability to get complete data off of a device, inability to properly parse the data that could be exported, or a combination of both. [AWS CloudTrail](https://docs.aws.amazon.com/cloudtrail/) solves both problems.
+
+It was also very difficult to precisely define user permissions for infrastructure work in legacy environments. At Scribd, this resulted in a small cadre of sysadmins having root access to everything and no one else having access to anything. With [AWS IAM](https://docs.aws.amazon.com/iam/), the [Instance Metadata Service (IMDS)](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html), or some combination of the two, access permissions can be easily set in a consistent format for any type of infrastructure resource.
+
+# A new solution to an old problem
+
+Unfortunately, native AWS services can't fully take advantage of the power that its event logging and permissions settings provide. Scribd needed a monitoring solution that could keep up with our expanding infra, alerting us when certain events occurred or when permissions were set inappropriately.
+
+We recently deployed the [Panther](https://www.runpanther.io) monitoring system in several AWS accounts. Right out of the box, it enables us to see certain near-real-time changes in these accounts, such as changes in security groups, using [AWS EventBridge](https://docs.aws.amazon.com/eventbridge/) as a base. It also performs a daily configuration check for a defined set of configuration options, such as S3 buckets' public writeability and the existence of MFA on root IAM user accounts. We currently have alerts for events and policy failures sent to a dedicated Slack channel. There is also a historical CloudTrail search functionality that makes hunting for events easy. The newest feature allows pivoting across multiple log sources. In other words, "correlations". That's what a [SIEM](https://en.wikipedia.org/wiki/Security_information_and_event_management) is built for.
+
+The other major power of Panther is extensibility. We can write custom "rules" (for events) and "policies" (for configurations) in Python. This policy checks for resources in monitored accounts that exist outside designated AWS regions:
+
+```python
+APPROVED_REGIONS = {
+ "us-east-1",
+ "us-east-2",
+}
+
+def policy(resource):
+    # Default per-region resources are created by AWS in every region,
+    # so they pass regardless of where they live.
+    if resource['ResourceType'] == 'AWS.EC2.NetworkACL' and resource.get("IsDefault"):
+        return True
+    if resource['ResourceType'] == 'AWS.EC2.VPC' and resource.get("IsDefault"):
+        return True
+    if resource['ResourceType'] == 'AWS.EC2.SecurityGroup' and resource.get("Name") == 'default':
+        return True
+    # Everything else must live in an approved region to pass the policy
+    return resource['Region'] in APPROVED_REGIONS
+```
+
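+Event-driven "rules" follow a similar shape, operating on a single log event and returning `True` when an alert should fire. Here is a hypothetical sketch (field names follow raw CloudTrail; this is not one of our production rules):
+
+```python
+def rule(event):
+    # Fire when someone logs into the AWS console without MFA
+    if event.get("eventName") != "ConsoleLogin":
+        return False
+    return event.get("additionalEventData", {}).get("MFAUsed") != "Yes"
+
+
+def title(event):
+    user = event.get("userIdentity", {}).get("arn", "unknown user")
+    return f"Console login without MFA by {user}"
+```
+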
+We can create highly granular alerts on similarly granular IAM permissions. Anyone who can read basic Python and understands AWS terminology can make new rules and policies. The sky's the limit on custom logic. The rules and policies aren't just static checkboxes, either. We can store all of them in our version control repository, as Panther provides the "[panther-analysis-tool](https://docs.runpanther.io/scanning/policies#writing-policies-with-the-panther-analysis-tool)" to allow batch uploads to the system.
+
+The infrastructure lives entirely inside our accounts and is built almost entirely from CloudFormation templates and Lambdas created by Panther. Since we use Terraform exclusively at Scribd for our own infrastructure, we translated some minimal IAM-related templates into Terraform (one of which we've contributed to Panther's [open source repo](https://github.com/panther-labs/panther/blob/master/deployments/auxiliary/terraform/panther_cloudsec_iam/main.tf)). We felt that the translations were a good idea because they involve setting custom IAM role names to enable certain cross-account access. They also won't need to be modified during a Panther upgrade.
+
+The infra is also entirely serverless. The main expense is the storage space for processed CloudTrail logs, which we can control quite precisely.
+
+Deploying (or upgrading) Panther is just a matter of forking the [open-source repo](https://github.com/panther-labs/panther) on Github and running a deployment script in a Docker container, either from a personal laptop or on an EC2 instance in AWS.
+
+We haven't made many changes to the custom rules and policies yet, but even the pre-baked ones are useful. Example event alerts we've received already include "EC2 Network Gateway Modified" (something quite important for checking continuity to the internet), "EC2 Route Table Modified" (ditto), and "EC2 Security Group Modified". It's worth reiterating that these alerts come in within 30 seconds of the action taking place. Policy alerts like "AWS VPC Default Security Group Restricts All Traffic" have already shown up, too. (All of these alerts were triggered as I was making a Terraform module to manage default VPC resources.)
+
+# Future plans
+
+More pre-built custom device log format parsing rules are on the horizon. Other SIEM solutions have attempted to do this (even Datadog is trying it now!), but it's often proved difficult for vendors to keep their parsing rules current. At a previous position, I remember realizing that A Large SIEM Vendor was years behind on parsing the native log format of A Large Firewall Company, and that many very important new fields were being dropped before making it to the store of processed data. It seems Panther wants to solve this by making all of the parsing code open-source, as well. Hopefully this motivates device vendors to keep their own Panther parsers up to date.
+
+Panther Labs is also planning to provide granularity to alert channels. Alerts will be assigned to, e.g., specific Slack channels for specific teams. This will allow teams to keep an eye on their own infrastructure in ways they haven't been able to do before at Scribd. Broad visibility of the entire ecosystem will allow more efficient incident response by quickly routing the right information to the right people.
+
+Overall, Panther has proved incredibly easy for a small security team to roll out to a multi-account enterprise environment, and it looks like we have an easily scalable and maintainable roadmap for the future.
diff --git a/_posts/2020-06-24-shipping-rust-to-production.md b/_posts/2020-06-24-shipping-rust-to-production.md
new file mode 100644
index 0000000..a02c6a0
--- /dev/null
+++ b/_posts/2020-06-24-shipping-rust-to-production.md
@@ -0,0 +1,143 @@
+---
+layout: post
+title: Ingesting production logs with Rust
+tags:
+- rust
+- syslog
+- hotdog
+- featured
+team: Core Platform
+author: rtyler
+---
+
+When we set our goals at the beginning of the year, "deploy Rust to production"
+was not among them, yet here we are. Our pattern of deploying small services in containers
+allowed us to easily bring Rust into production, replacing a difficult-to-manage
+service in the process. In January, the Core Platform team started working on a
+project called "View Analytics". The effort was primarily to replace an aging
+system which was literally referred to as "old analytics." As part of the View
+Analytics design we needed to provide an entry point for [Fastly](https://fastly.com) to relay access logs as syslog-formatted
+messages which could then be written into [Apache Kafka](https://kafka.apache.org), driving the entire
+View Analytics data pipeline. Our initial rollout shipped an [rsyslog](https://www.rsyslog.com)-based solution
+for the “rsyslog-kafka” service. Using rsyslogd worked fairly well, but it had a
+couple of significant downsides. Last month, we deployed its replacement: a
+custom open source daemon written in Rust: [hotdog](https://github.com/reiseburo/hotdog) 🌭.
+
+(**Note:** _This specific use-case was well suited to Rust. That does not mean
+that anything else we do at Scribd should or will necessarily be written in
+Rust._)
+
+
+## Problems with rsyslog
+
+rsyslog is one of those tools that seems to have existed since the dawn of
+time. It is incredibly common to find in logging infrastructure since it routes
+just about any log from any thing, to any where. Our first iteration of the
+aforementioned `rsyslog-kafka` service relied on it because of its ubiquity. We
+had a problem that looked like routing logs from one thing (Fastly) to another
+thing (Kafka), and that's basically what `rsyslogd` does!
+
+However, when explaining to colleagues what rsyslog
+_really_ is, I would describe it as "an old C-based scripting engine that just
+happens to forward logs." If they didn't believe me, I would send them the
+documentation to
+[Rainerscript](https://rsyslog.readthedocs.io/en/latest/rainerscript/), named
+after [Rainer Gerhards](https://en.wikipedia.org/wiki/Rainer_Gerhards), the
+author of `rsyslog`. I find it incredibly difficult to work with, and even harder to test.
+
+In our pipeline, we needed to bring JSON formatted messages from Fastly and
+route them to the appropriate topics, using the approximate format of:
+
+```json
+{
+ "$schema" : "some/jsonschema/def.yml",
+ "$topic" : "logs-fastly",
+ "meta" : {
+ },
+ "url" : "etcetc",
+ "timestamp" : "iso8601"
+}
+```
+
+JSON parsing in rsyslog is feasible, but not easy. For example, there
+is no way to handle JSON keys which use the dollar-sign `$`, because the
+scripting interpreter treats `$` characters as variable references. The
+original version of our rsyslog-kafka gateway that went into production
+uses regular expressions to fish out the topic!
+
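+By contrast, a JMESPath expression (hotdog, described below, supports JMESPath
+routing rules) handles such keys simply by quoting them. A quick illustration
+using the `jmespath` Python library, not hotdog's actual configuration syntax:
+
+```python
+import jmespath
+
+message = {"$schema": "some/jsonschema/def.yml", "$topic": "logs-fastly"}
+
+# Double-quoted identifiers let JMESPath address keys containing `$`
+assert jmespath.search('"$topic"', message) == "logs-fastly"
+```
+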
+Unfortunately, the daemon also does not emit metrics or statistics natively in
+a format we could easily get into Datadog. The only way to get the statistics
+we needed would have been to ingest stats written out to a file through a
+sidecar container and report those into Datadog. This would have required
+building a custom daemon to parse the rsyslogd stats output, which seemed like
+a lot of work for a little bit of benefit.
+
+We didn't know how this difficult, untestable service would actually run in production.
+
+
+## Makin' hotdogs
+
+Bored one weekend with nothing to do, I asked myself “how hard could getting syslog into Kafka be?” As it turned out: _not that hard_.
+
+I continued to improve [hotdog](https://github.com/reiseburo/hotdog) over a number of
+weeks until I had feature parity with our rsyslogd use-case, and then some!
+
+* RFC 5424/3164 syslog-formatted message parsing
+* Custom routing based on regular expression or [JMESPath](https://jmespath.org/) rules
+* syslog over TCP, or TCP with TLS encryption
+* Native statsd support for a myriad of operational metrics we care about
+* Inline message modification based on simple Handlebars templates
+
+Since the rsyslog-kafka service is deployed in a Docker container, we deployed
+a new build of the container with 🌭 inside to our development environment and
+started testing. Once testing looked good, we deployed to production at the
+end of May.
+
+Overall the process went well!
+
+
+## What was learned
+
+The biggest take-away from this effort has been the power of small services
+packaged into Docker containers. The entire inside of the container changed,
+but because the external contracts did not, the service could be
+significantly modified without issue.
+
+The original implementation was ~2x slower than rsyslog and required a doubling
+of the number of containers running in ECS. The poor performance came down
+almost entirely to laziness in the original Rust implementation: repeated
+parsing of JSON strings, reallocations, and unnecessary polling.
+
+The performance issues were easily identified and fixed with the help of
+`perf` on Linux (`perf record --call-graph dwarf` is wonderful!). That said, I
+am still quite impressed that a completely unoptimized Rust daemon, built on
+[async-std](https://async.rs), was performing reasonably close to a
+finely-tuned system like `rsyslogd`. While I haven't done a conclusive
+comparison, now that hotdog has been optimized I would guesstimate that it is
+within +/-10% of performance parity with `rsyslogd`.
+
+
+
+
+Having full control over the syslog entrypoint proved valuable almost
+immediately. During a pairing session with my colleague Hamilton, he expressed the
+desire for an additional metric: per-topic message submission counters. In
+`rsyslogd` the metric doesn't exist in any form, but because hotdog was built to
+support statsd out of the box, we made a one-line change adding the new metric
+and our visibility improved right away!
+
+
+The syslog-to-Kafka gateway was a critical piece of the overall View Analytics
+data pipeline, but having such a system available has already paid dividends. A
+number of other internal projects have taken advantage of the ability to route
+syslog traffic into Kafka.
+
+---
+
+
+Scribd has a number of services deployed in production using Ruby, Golang,
+Python, Java, and now a little bit of Rust too. As far as weekend hacks go,
+[hotdog](https://github.com/reiseburo/hotdog) worked out quite well. If you
+have thousands of log entries per second that you need to get into Kafka, give
+it a try!
+
diff --git a/_posts/2020-06-26-streaming-with-delta-lake.md b/_posts/2020-06-26-streaming-with-delta-lake.md
new file mode 100644
index 0000000..62f67f4
--- /dev/null
+++ b/_posts/2020-06-26-streaming-with-delta-lake.md
@@ -0,0 +1,181 @@
+---
+layout: post
+title: Streaming data in and out of Delta Lake
+tags:
+- databricks
+- real-time
+- kafka
+- featured
+team: Core Platform
+author: rtyler
+---
+
+
+With [Delta Lake](https://delta.io) we don't have the lines between
+streaming and batch data typically found in data platforms. Scribd
+developers can treat data as real-time as they wish! Delta Lake enables some
+workloads to treat data sets like they are traditional "batchy" data stores,
+while other workloads work with _the same data_ as a streaming source or sink.
+This immense flexibility allows our data engineers and scientists to mix and
+match data quickly, providing the business with valuable results at
+unprecedented speed.
+
+At its core Delta Lake combines [Apache Parquet](https://parquet.apache.org/) with a transaction log. This simple
+foundation enables _incredible_ higher level data-access behaviors from [Apache Spark](https://spark.apache.org), which powers the vast majority of our data platform at Scribd.
+When we first considered building a
+[Real-time Data Platform](/blog/2019/real-time-data-platform.html)
+the storage layer was "to be determined". In hindsight, I cannot imagine how a
+team of fewer than ten developers could otherwise have successfully delivered on the
+"Real-time" vision in so short a time. Much of that success rests on adopting
+Delta Lake, and in this post I would like to share some of the motivations,
+valuable features, and perhaps most importantly **caveats** to adopting Delta
+Lake for streaming data needs.
+
+
+## Beforetimes
+
+Storage is the most foundational component of a data platform, and we were in
+bad shape at the beginning of the effort. The original storage layer was built on top
+of [HDFS](https://en.wikipedia.org/wiki/HDFS), which was a _very_ reasonable decision at the time. Unfortunately, as the years
+went on, our use of HDFS did not keep up with the times. Technical debt accrued in many forms:
+
+* Uncompressed data
+* Multiple different file formats: depending on what era a partition of data was written in, it might be Parquet, ORC, RCFile, or just dumb plaintext.
+* [Small files](https://www.quora.com/What-is-the-small-file-problem-in-Hadoop?share=1): over 60% of the files in the cluster were considered "small files".
+
+
+
+
+
+The storage layer was failing to meet our _batch_ needs well before we had even
+considered layering streaming data on top of it.
+
+
+## Welcome to the future
+
+[Delta Lake](https://delta.io) solved a **lot** of the problems we had, and
+even a few we did not know we had yet! We adopted Delta Lake in line with our shift into the cloud, which I recently wrote about on the Databricks blog:
+[Accelerating developers by ditching the data center](https://databricks.com/blog/2020/06/10/accelerating-developers-by-ditching-the-data-center.html).
+Yet, Delta Lake wasn't our first choice and didn't motivate our shift to AWS.
+Our original prototype consisted of writing Parquet files to S3, where we
+immediately noticed potential problems.
+
+### Data Consistency
+
+S3 is _eventually consistent_. If you create an object `bucket/foo.gz`, you can
+retrieve `bucket/foo.gz` immediately, but other clients issuing list or
+metadata commands may see `foo.gz` appear at different times. In a system where
+one job is writing data into a bucket and another is reading data out of that
+bucket, **consistency** becomes a major concern. Many organizations address this
+by deploying
+[S3Guard](https://hadoop.apache.org/docs/r3.1.1/hadoop-aws/tools/hadoop-aws/s3guard.html),
+which mitigates the problem. Delta Lake provides us with **ACID transactions**
+that make the entire data consistency question moot.
+
+> What I wrote to storage is exactly what the other job will read
+
+### Streaming to Delta
+
+Delta Lake makes building a streaming platform almost _trivial_ with two key
+higher-level behaviors: streaming sinks and sources. Like practically any data
+store, you can stream data _into_ a Delta table, though Delta's transactions
+make this a much safer operation when deploying on eventually consistent data
+stores like S3. Delta tables can, however, also act as a **source** for another streaming consumer.
+
+In Scribd's deployment, this allows us to have some Spark Streaming jobs which
+are consuming directly from [Apache Kafka](https://kafka.apache.org), while
+other downstream streaming jobs consume _from Delta tables themselves_.
+
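+As a rough PySpark sketch of both behaviors (the paths and checkpoint locations
+here are purely illustrative):
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+
+# Read one Delta table as a streaming source...
+events = spark.readStream.format("delta").load("s3://bucket/tables/raw_events")
+
+# ...and continuously append the stream into another Delta table (the sink).
+(events.writeStream
+    .format("delta")
+    .outputMode("append")
+    .option("checkpointLocation", "s3://bucket/checkpoints/raw_to_enriched")
+    .start("s3://bucket/tables/enriched_events"))
+```
+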
+My [previous post](/blog/2020/shipping-rust-to-production.html) alluded to the "View Analytics" project, which relies heavily on Delta Lake's streaming support:
+
+
+
+
+By utilizing Delta tables as _streaming sources_ in the pipeline above, we
+enable ad-hoc and other batch workloads to query data within the pipeline as if
+it were just another table in the catalog. This is **key** for us because it
+means many of our data consumers are _only_ interacting with Delta tables,
+rather than having to switch between pulling data from Kafka topics and
+separately from batch tables. Delta Lake allows the data consumers to treat
+them all as just tables, although some are a little more real-time than others!
+
+#### Caveats
+
+Delta tables within a streaming context do have some limitations which are
+important to be aware of when designing a system. Many of our lessons learned
+came by partnering with [Databricks](https://databricks.com) during the design
+phase of the View Analytics project. Fortunately they were able to identify
+some ways in which our original designs would have failed before we ended up
+building and deploying anything to production.
+
+Some things to keep in mind:
+
+* Multiple streams can _append_ to the same table concurrently, *but* if there
+  are any non-append writers (e.g. [merge writers](https://docs.delta.io/latest/delta-update.html)) then no other
+  writers should run concurrently with the non-append writer. There are some distinctions here depending on whether the jobs are running in a Databricks runtime or not, and whether those jobs are running in the same workspace. Generally speaking, it's best to only use append-only tables as streaming sources.
+* When there are any non-append writers, an `OPTIMIZE` cannot run externally. In essence it should be executed inline in the streaming job when the merge writer is not running, i.e. periodically within a `foreachBatch` (see the sketch after this list). Locking features only available in the Databricks runtime may allow for concurrent upsert writers, but your mileage may vary!
+* [Checkpoints](https://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing) must be managed *carefully*. Each checkpoint location should belong exclusively to a single write stream, restarts of the job must always use the same checkpoint location, and you should never reference the same checkpoint location from multiple write streams, as they will overwrite each other's checkpoints (very bad).
+
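+A rough PySpark sketch of the "optimize inline from `foreachBatch`" idea,
+assuming `spark` and a streaming DataFrame `stream_df` are already defined
+(table names, merge keys, and the maintenance cadence are illustrative, and
+`OPTIMIZE` requires the Databricks runtime):
+
+```python
+from delta.tables import DeltaTable
+
+
+def upsert_and_maintain(batch_df, batch_id):
+    target = DeltaTable.forName(spark, "analytics.view_events")
+    # The non-append (merge) writer for this table
+    (target.alias("t")
+        .merge(batch_df.alias("s"), "t.event_id = s.event_id")
+        .whenMatchedUpdateAll()
+        .whenNotMatchedInsertAll()
+        .execute())
+    # Occasionally compact the table while the merge writer is idle
+    if batch_id % 100 == 0:
+        spark.sql("OPTIMIZE analytics.view_events")
+
+
+(stream_df.writeStream
+    .foreachBatch(upsert_and_maintain)
+    .option("checkpointLocation", "s3://bucket/checkpoints/view_events_merge")
+    .start())
+```
+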
+
+### Optimize
+
+Building further upon the foundation laid by transactions, Delta Lake provides
+an `OPTIMIZE` command, which helps prevent the small files problem entirely.
+
+In the streaming context, it is highly unlikely that event data will come in
+perfectly even-sized batches that can be written to storage. At a high level,
+when the optimize command is run it will:
+
+* Create a new transaction
+* Take a bunch of small files
+* Combine them together as larger, well-sized and compressed files
+* Write those to storage
+* Complete the transaction
+
+
+This can all be done _online_, which means whatever data is streaming into the table can continue to stream.
+
+
+**NOTE:** Since `OPTIMIZE` doesn't delete the small files it has compacted, periodic [vacuum commands](https://docs.delta.io/0.3.0/delta-utility.html#vacuum) are necessary to reduce storage bloat.
+
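+For example, a maintenance job might run something along these lines (the
+retention window is illustrative; shorter windows need care with concurrent
+readers and writers):
+
+```python
+# Remove files no longer referenced by the table and older than 7 days
+spark.sql("VACUUM delta.`s3://bucket/tables/enriched_events` RETAIN 168 HOURS")
+```
+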
+
+### General caveats
+
+Delta Lake is a great piece of technology for streams and batch workloads
+alike, but regardless of how it is used, there are some general limitations to bear
+in mind. Perhaps the most notable is that it can/should _only_ be accessed from
+Apache Spark. This usually means that a Spark cluster must be launched in order
+to read or write from Delta tables, and the corresponding costs associated with
+that should be known. While there are
+[efforts](https://github.com/reiseburo/delta.rs) to provide client APIs
+external to Spark, nothing yet exists which is production-ready.
+
+We are Databricks customers, which means we're actually using the proprietary
+Delta libraries which are included in the Databricks Spark runtimes. In our
+testing and development we have observed a number of important gaps between
+what is available in the open source delta.io client libraries, compared to
+what is present in the Databricks version. These gaps have mostly affected our
+ability to do fully local testing, dictating a certain amount of manual testing
+for Spark streaming jobs launched in Databricks, before they are deployed to
+production.
+
+
+There are a number of other caveats when using Delta that are important to be
+aware of, and while I cannot cite every single one here I will stress: read the
+documentation **thoroughly**. There are a number of callouts in the docs which
+highlight a constraint or behavior which will impact the design of streaming
+systems you may build on Delta Lake. Very little of what we learned working
+with Databricks was _not_ documented; in almost all cases we had misread,
+misunderstood, or simply missed a salient detail in the documentation.
+
+
+---
+
+We started working on the real-time data platform in January, and our
+first production streaming workloads were deployed by March. Adopting Delta
+Lake allowed us to move quickly and deploy streaming systems at an incredible
+pace. For a company that has a long history of batch workloads, the sudden
+arrival of streaming data has been transformative. Rather than waiting 24-48
+hours for data in some cases, data consumers are able to access newly written
+data within _seconds_.
+
diff --git a/_posts/2020-08-10-datadog-backup.md b/_posts/2020-08-10-datadog-backup.md
new file mode 100644
index 0000000..736627d
--- /dev/null
+++ b/_posts/2020-08-10-datadog-backup.md
@@ -0,0 +1,92 @@
+---
+layout: post
+title: Backing up Datadog and Performing Bulk Edits
+tags:
+- datadog
+- monitoring
+- featured
+team: Core Infrastructure
+authors:
+- jimp
+- kamranf
+
+---
+
+
+What would happen if someone accidentally deleted a dashboard or important
+monitor in Datadog? How would we know that it had changed? All of our monitoring
+and metrics at Scribd are configured in [Datadog](https://www.datadoghq.com), so
+it was important to have a mechanism to **track changes** across all these
+resources and be able to **revert back** to a previous state in case of a bad
+change. We also wanted the ability to **search and edit** across all dashboards
+and monitors (to refactor a tag, for example).
+
+When composing dashboards and monitors, immediate feedback is critical.
+Therefore, as we evaluated existing tools, we looked for the ability to coexist
+with Datadog’s native user interface.
+
+Finding none that exactly fit our needs, we wrote [Datadog
+Backup](https://github.com/scribd/datadog_backup), an open source Ruby gem.
+Datadog Backup focuses on the ability to perform global backups and bulk editing
+without impeding the free use of the Datadog User Interface.
+
+Aspects of this project were inspired by the likes of
+[Doggy](https://github.com/Shopify/doggy). As you consider your approach to
+Datadog management, we highly recommend you also take a look at it and other
+client libraries at
+[https://docs.datadoghq.com/developers/libraries/#datadog-client-community-libraries](https://docs.datadoghq.com/developers/libraries/#datadog-client-community-libraries).
+
+
+## How to use Datadog Backup
+
+The intended use case for Datadog Backup is as a component of a GitHub Actions
+workflow, or similar CD pipeline, that takes regular backups of Datadog, then
+commits them to a git repository. This enables such a repository to be used as an
+audit trail, capturing the state of Datadog configuration at regular intervals.
+
+A local backup additionally makes it possible to search the YAML or
+JSON definitions and perform mass updates, which can then be pushed back to
+Datadog. To use the tool to make a mass edit to Datadog configuration, one
+modifies the repository locally, then runs the command in “restore” mode.
+
+
+### Running the tool
+
+
+```
+gem install datadog_backup
+export DATADOG_API_KEY=abc123
+export DATADOG_APP_KEY=abc123
+
+# Perform backup to optional/path/to/backupdir using YAML encoding
+datadog_backup backup --backup-dir optional/path/to/backupdir
+
+# Make some changes
+
+# Just review the changes since last backup
+datadog_backup diffs --backup-dir optional/path/to/backupdir
+
+# Review and apply local changes to datadog
+
+datadog_backup restore --backup-dir optional/path/to/backupdir
+```
+
+### Using the tool in GitHub Actions
+
+Included in the git repo for Datadog Backup is an [example GitHub Actions
+workflow](https://github.com/scribd/datadog_backup/tree/master/example) for
+periodically backing up your Datadog deployment.
+
+### Further development
+
+Some areas to further expand the gem are:
+
+- Backup of metadata
+
+If you find this gem useful and would like to expand on it, [contributions are
+welcome](https://github.com/scribd/datadog_backup)! Find out more about what the
+Core Infrastructure team is up to by [reading more of our
+posts](https://tech.scribd.com/blog/category/core-infrastructure#posts).
+If shipping metrics and managing cloudy things is up your alley, you just might
+be the kind of person who'd love to work here, so [click
+here](https://www.scribd.com/about/engineering) to review our open positions!
diff --git a/_posts/2020-08-26-engineering-firstweek.md b/_posts/2020-08-26-engineering-firstweek.md
new file mode 100644
index 0000000..2104c0e
--- /dev/null
+++ b/_posts/2020-08-26-engineering-firstweek.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "My first week in Scribd engineering"
+tags:
+- featured
+team: Core Infrastructure
+author: jasond
+---
+
+
+Hello everyone, I'm new at Scribd and here's what the first week was like on the Core Infrastructure engineering team. It's full of smart people, great culture, cool tech and I wanted to share my experience.
+
+The journey starts, of course, with an interview phase. I've been through the wringer before, but this round has to be one of the most pleasant interview experiences I have had. From start to finish I was clear on next steps and what to expect. The recruiter was awesome; she scheduled prep meetings with me to answer questions before interviews. I got to meet my manager, the team director, all but one of my teammates (who was on PTO), and folks from other teams, even teammates in Amsterdam. The process was entirely remote due to the pandemic, but that didn't take away from the experience. Thanks to excellent use of Google Meet, it felt just as personal to me as an onsite would have been.
+
+When joining a new company, there is always a ton to learn and it can be a little overwhelming. Thankfully I was not alone at my first-day orientation; I was joined by a group of new hires. Since we were all in the same boat, we started our own new-hire Slack channel to support each other. To get started there was a helpful and thorough checklist for newbies to walk through, covering the basics of our benefits, systems access, and other necessities. The team-specific on-boarding guide focused on setup tasks for the tools I would use in everyday work. The best part, though, has got to be that I got a new buddy! Scribd pairs new hires with buddies to show them the ropes, which goes a long way toward alleviating a bit of that new-job tension. You can go to your new-hire buddy for anything from on-boarding to team-specific questions. In my case my buddy is a tech lead, so he has a wealth of knowledge about our stack and knows who to point me to for answers.
+
+I was happy to hear that Scribd does new-hire meet-and-greets to introduce managers and key people from other teams and start building collaborative relationships. Typically they are from teams you'd work closely with, so you can add more faces to your cache and get to know people. This is huge in getting comfortable with each team's area of responsibility and who knows what. Everyone I've met at Scribd is crazy smart but, more importantly, super friendly, which makes growing into the new job much smoother and easier. There are fun Slack channels for all kinds of things (coffee, cooking, pets, sports, cars, you name it), making it easy to nerd out and meet new people.
+
+As for the tech stack, some of it was completely new to me, but that makes it exciting to learn new things and get different perspectives on solving problems. It also creates many opportunities to learn from the folks who are subject-matter experts on different areas of the stack. Core Infrastructure builds and maintains the foundation that is shared by many other engineering teams. This means that I get to keep up with Terraform, AWS, Kubernetes, and whatever fresh new tech becomes available in the infrastructure space. For me the cool thing is being part of many engineering efforts and getting to share knowledge with, and learn from, each team and each project.
+
+Any first week at Scribd is going to be memorable but I was especially lucky in that my first day [Scribd announced the acquisition of SlideShare](https://blog.scribd.com/home/welcome-slideshare-to-the-scribd-community). I can't guarantee that your first day will be _that_ exciting, but I'm certain [your first week will be a good one](https://tech.scribd.com/careers/#open-positions).
diff --git a/_posts/2020-09-08-qp-at-airflow.md b/_posts/2020-09-08-qp-at-airflow.md
new file mode 100644
index 0000000..aefa27f
--- /dev/null
+++ b/_posts/2020-09-08-qp-at-airflow.md
@@ -0,0 +1,27 @@
+---
+layout: post
+title: "Airflow Summit 2020: Teaching an old DAG new tricks"
+tags:
+- featured
+- airflow
+- airflow-series
+- datapipe
+team: Core Platform
+author: qphou
+---
+
+This summer I was able to present at [Airflow
+Summit](https://airflowsummit.org/) on the progress we have made migrating from
+our legacy orchestrator to [Apache Airflow](https://airflow.apache.org) in the
+cloud. The video is available on
+[YouTube](https://www.youtube.com/watch?v=DHDlD-bMM3c) and embedded below.
+
+
+
+For more background see my previous blog posts:
+
+* [Teaching an old DAG new tricks](/blog/2020/modernizing-an-old-data-pipeline.html)
+* [Breaking up the Airflow DAG monorepo](/blog/2020/breaking-up-the-dag-repo.html)
+
+I hope you enjoy!
+
diff --git a/_posts/2020-09-09-spark-ai-summit-delta-lake.md b/_posts/2020-09-09-spark-ai-summit-delta-lake.md
new file mode 100644
index 0000000..88c558a
--- /dev/null
+++ b/_posts/2020-09-09-spark-ai-summit-delta-lake.md
@@ -0,0 +1,42 @@
+---
+layout: post
+title: "Spark and AI Summit 2020: The revolution will be streamed"
+tags:
+- featured
+- databricks
+- spark
+- deltalake
+team: Core Platform
+author: rtyler
+---
+
+Earlier this summer I was able to present at Spark and AI Summit about some of
+the work we have been doing in our efforts to build the [Real-time Data
+Platform](/blog/2019/real-time-data-platform.html). At a high level,
+what I had branded the "Real-time Data Platform" is really: [Apache
+Kafka](https://kafka.apache.org), [Apache Airflow](https://airflow.apache.org),
+[Structured streaming with Apache Spark](https://spark.apache.org), and a
+smattering of microservices to help shuffle data around, all sitting on top of
+[Delta Lake](https://delta.io), which acts as an incredibly versatile and useful
+storage layer for the platform.
+
+In the presentation I outline how we tie together Kafka,
+Databricks, and Delta Lake.
+
+
+
+
+
+The presentation also complements some of our
+blog posts:
+
+* [Streaming data in and out of Delta Lake](/blog/2020/streaming-with-delta-lake.html)
+* [Streaming development work with Kafka](/blog/2020/introducing-kafka-player.html)
+* [Ingesting production logs with Rust](/blog/2020/shipping-rust-to-production.html)
+* [Migrating Kafka to the cloud](/blog/2019/migrating-kafka-to-aws.html)
+
+
+I am incredibly proud of the work the Platform Engineering organization has
+done at Scribd to make real-time data a reality. I also cannot recommend Kafka +
+Spark + Delta Lake highly enough for those with similar requirements.
+
diff --git a/_posts/2020-09-15-integrating-databricks-and-datadog.md b/_posts/2020-09-15-integrating-databricks-and-datadog.md
new file mode 100644
index 0000000..4cbc1b3
--- /dev/null
+++ b/_posts/2020-09-15-integrating-databricks-and-datadog.md
@@ -0,0 +1,201 @@
+---
+layout: post
+title: "Integrating Databricks jobs with Datadog"
+author: qphou
+tags:
+- featured
+- databricks
+- datadog
+- datapipe
+team: Core Platform
+---
+
+Batch and streaming Spark jobs are an integral part of our data platform and,
+like our other production applications, they need
+[Datadog](https://datadoghq.com) instrumentation. We rely on
+[Databricks](https://databricks.com/customers/scribd) to power those Spark
+workloads, but integrating Datadog and Databricks wasn't turn-key. In this
+post, I'll share the two code snippets necessary to enable this integration: a custom cluster init script, and a special class to load into the Spark job.
+
+Rather than relying on the Spark UI in Databricks, piping these metrics into
+Datadog allows us to build extremely useful dashboards and, more importantly,
+**monitors** for our Spark workloads that can tie into our alerting
+infrastructure.
+
+
+## Configuring the Databricks cluster
+
+When creating a cluster in Databricks, we set up and configure the Datadog
+agent with the following init script on the driver node:
+
+```bash
+#!/bin/bash
+# reference: https://docs.databricks.com/clusters/clusters-manage.html#monitor-performance
+#
+# This init script takes the following environment variables as input
+# * DATADOG_API_KEY
+# * ENVIRONMENT
+# * APP_NAME
+
+echo "Running on the driver? $DB_IS_DRIVER"
+
+if [[ $DB_IS_DRIVER = "TRUE" ]]; then
+  echo "Setting up metrics for spark application: ${APP_NAME}"
+ echo "Driver ip: $DB_DRIVER_IP"
+
+ cat << EOF >> /home/ubuntu/databricks/spark/conf/metrics.properties
+*.sink.statsd.host=${DB_DRIVER_IP}
+EOF
+
+ DD_INSTALL_ONLY=true \
+ DD_AGENT_MAJOR_VERSION=7 \
+ DD_API_KEY=${DATADOG_API_KEY} \
+ DD_HOST_TAGS="[\"env:${ENVIRONMENT}\", \"spark_app:${APP_NAME}\"]" \
+ bash -c "$(curl -L https://raw.githubusercontent.com/DataDog/datadog-agent/7.22.0/cmd/agent/install_script.sh)"
+
+ cat << EOF >> /etc/datadog-agent/datadog.yaml
+use_dogstatsd: true
+# bind on all interfaces so it's accessible from executors
+bind_host: 0.0.0.0
+dogstatsd_non_local_traffic: true
+dogstatsd_stats_enable: false
+logs_enabled: false
+cloud_provider_metadata:
+ - "aws"
+EOF
+
+  # NOTE: set this to true if you need to debug dogstatsd metric handling
+  echo "dogstatsd_metrics_stats_enable: false" >> /etc/datadog-agent/datadog.yaml
+
+ sudo service datadog-agent start
+fi
+```
+
+The cluster also needs to be launched with the following environment variables
+in order to configure the integration:
+
+ * `ENVIRONMENT=development/staging/production`
+ * `APP_NAME=your_spark_app_name`
+ * `DATADOG_API_KEY=KEY`
+
+
+Once the cluster has been fully configured with the above init script, you can
+then send metrics to Datadog from Spark through the statsd port exposed by the
+agent. All your Datadog metrics will be automatically tagged with `env` and
+`spark_app` tags.
+
+In practice, you can set up all of this using DCS ([customized containers with
+ Databricks
+Container Services](https://docs.databricks.com/clusters/custom-containers.html)) as well.
+But we decided against it in the end because we ran into many issues with DCS,
+including out-of-date base images and a lack of support for built-in cluster
+metrics.
+
+
+### Sending custom metrics from Spark
+
+Integrating statsd with Spark is _very_ simple. To reduce boilerplate, we built
+an internal helper utility that wraps the `com.timgroup.statsd` library:
+
+
+```scala
+import com.timgroup.statsd.{NonBlockingStatsDClientBuilder, StatsDClient}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.streaming.StreamingQueryListener
+
+import scala.collection.JavaConverters._
+
+/** Datadog class for automating Databricks <> Datadog integration.
+ *
+ * NOTE: this package relies on datadog agent to be installed and configured
+ * properly on the driver node.
+ */
+class Datadog(val appName: String)(implicit spark: SparkSession) extends Serializable {
+ val driverHost: String = spark.sparkContext.getConf
+ .getOption("spark.driver.host")
+ .orElse(sys.env.get("SPARK_LOCAL_IP"))
+ .get
+
+ def statsdcli(): StatsDClient = {
+ new NonBlockingStatsDClientBuilder()
+ .prefix(s"spark")
+ .hostname(driverHost)
+ .build()
+ }
+
+ val metricsTag = s"spark_app:$appName"
+
+ def collectStreamsMetrics(): Unit = {
+ spark.streams.addListener(new StreamingQueryListener() {
+ val statsd: StatsDClient = statsdcli()
+ override def onQueryStarted(queryStarted: StreamingQueryListener.QueryStartedEvent): Unit = {}
+ override def onQueryTerminated(queryTerminated: StreamingQueryListener.QueryTerminatedEvent): Unit = {}
+ override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
+ val progress = event.progress
+ val queryNameTag = s"query_name:${progress.name}"
+ statsd.gauge("streaming.batch_id", progress.batchId, metricsTag, queryNameTag)
+ statsd.count("streaming.input_rows", progress.numInputRows, metricsTag, queryNameTag)
+ statsd.gauge("streaming.input_rows_per_sec", progress.inputRowsPerSecond, metricsTag, queryNameTag)
+ statsd.gauge("streaming.process_rows_per_sec", progress.processedRowsPerSecond, metricsTag, queryNameTag)
+ progress.durationMs.asScala.foreach { case (op, v) =>
+ statsd.gauge(
+ "streaming.duration", v, s"operation:$op", metricsTag, queryNameTag)
+ }
+ }
+ })
+ }
+}
+```
+
+Initializing the helper class takes two lines of code:
+
+```scala
+implicit val spark = SparkSession.builder().getOrCreate()
+val datadog = new Datadog(AppName)
+```
+
+Then you can use `datadog.statsdcli()` to create statsd clients from within
+both the **driver** and **executors** to emit custom metrics:
+
+
+```scala
+val statsd = datadog.statsdcli()
+statsd.count(s"${AppName}.foo_counter", 100)
+```
+
+**Note:** The Datadog agent flushes metrics on a [preset
+interval](https://docs.datadoghq.com/developers/dogstatsd/data_aggregation/#how-is-aggregation-performed-with-the-dogstatsd-server)
+that can be configured from the init script. By default, it's 10 seconds. This
+means that if your Spark application, running in a job cluster, exits immediately
+after a metric has been sent to the Datadog agent, the agent won't have enough time
+to forward that metric to Datadog before the Databricks cluster shuts down. To
+address this issue, you need to put a manual sleep at the end of the Spark
+application so the Datadog agent has enough time to flush the newly ingested
+metrics.
+
+
+### Instrumenting a Spark streaming app
+
+Users of the Datadog helper class can also push all Spark streaming progress
+metrics to Datadog with one line of code:
+
+```scala
+datadog.collectStreamsMetrics()
+```
+
+This method sets up a streaming query listener to collect streaming progress
+metrics and send them to the Datadog agent. All streaming progress metrics will
+be tagged with `spark_app` and `query_name` tags. We use these streaming
+metrics to monitor streaming lag, issues with our batch sizes, and a number
+of other actionable metrics.
+
+And that’s it for the application setup!
+
+---
+
+In the future a more "native" integration between Databricks and Datadog would
+be nice, but these two code snippets have helped bridge a crucial
+instrumentation and monitoring gap with our Spark workloads. On the Core
+Platform and Data Engineering teams we continue to invest in Spark and would
+love your help building out our reliable and high-performance data platform,
+[come join us!](/careers/#open-positions)
diff --git a/_posts/2020-12-04-optimize-databricks-cluster-configuration.md b/_posts/2020-12-04-optimize-databricks-cluster-configuration.md
new file mode 100644
index 0000000..ccf81ad
--- /dev/null
+++ b/_posts/2020-12-04-optimize-databricks-cluster-configuration.md
@@ -0,0 +1,117 @@
+---
+layout: post
+title: "How we optimize Databricks clusters configuration with Apache Airflow"
+author: maksymd
+tags:
+- featured
+- databricks
+- airflow
+- aws
+team: Data Engineering
+---
+
+Scribd recently changed the way we run thousands of data-processing tasks in order to save 10-20% _more_ on our cloud compute costs. Scribd’s data platform is built on top of [Databricks on AWS](https://databricks.com/customers/scribd) and runs 1500+ Apache Spark batch and streaming applications. To help orchestrate all the batch workloads we also use [Apache Airflow](https://airflow.apache.org/). By default, Databricks provides a rich set of [cluster configuration](https://docs.databricks.com/clusters/configure.html) options. We can use different EC2 instance types, AWS Availability Zones (AZ), spot or on-demand instances, autoscaling, etc. By examining the needs of our workloads, we were able to optimize the way we leverage Databricks and AWS to gain more reliability and 10-20% more cost savings in our data platform.
+
+
+
+For most clusters which run our batch workloads, we use auto-scaling and spot instances, falling back to on-demand instances. Theoretically, this helps us to save up to 90% on cloud infrastructure. But in the real world, using spot instances comes with some limitations, problems, and areas for optimization. In this post, I will share some tips that we use for an optimized Databricks cluster configuration.
+
+
+
+*Databricks cluster configuration parameters*
+
+First, let’s review some of the problems we encounter with an “out of the box” Databricks cluster configuration:
+
+* EC2 spot prices are not static across availability zones (AZs). Sometimes the spot price rises all the way up to the on-demand price.
+
+
+*This example shows that the price difference between us-east-1b and us-east-1f was 40%*
+
+* AWS has a limited number of instances in each AZ. When instances of a given type are not available, AWS throws an exception: “We currently do not have sufficient capacity in the Availability Zone you requested”.
+
+* In a Databricks cluster, we must use a predefined cluster configuration with a specified instance type and AZ. It’s not possible to apply a fallback algorithm when the requested instances are not available in the desired AZ.
+
+To address these problems, we set the following goal: we need to be flexible about instance types and availability zones, and provide a fallback mechanism when instances are not available. Unfortunately, we are not able to solve these problems in the Databricks platform alone. Since our primary interface to Databricks for automated workloads is Apache Airflow, we can customize our Airflow operators to achieve the goal!
+
+**How it works.**
+
+In Airflow we use our own `DatabricksSparkOperator`. In this operator, we set up a cluster configuration which Airflow can then use when submitting jobs to Databricks.
+
+For cluster configuration, we offer two different sets of parameters, one to describe instance types desired for the workload, and the other which describes the Spark execution environment needed by the workload. Each set is handled by a different optimization strategy:
+
+The instance-based configuration contains a couple of key parameters:
+
+* `node_type_id`, an AWS instance type
+* `num_workers`, the minimum/maximum number of workers for auto-scaling.
+
+To optimize these workloads we use a very simple algorithm based on the [DescribeSpotPriceHistory API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSpotPriceHistory.html). We extract all AZs for the requested instance type in a specific AWS region and sort those zones by the current spot price (from the cheapest to the most expensive). After that, we request a Databricks cluster in the first (cheapest) zone on the list. If AWS doesn’t have enough capacity for that instance type in the first AZ, we fall back and try the next one.
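+
+Here is a minimal sketch of what that AZ-selection and fallback step can look like. It assumes `boto3` credentials are already configured and uses a hypothetical `launch_databricks_cluster` helper for the actual Databricks API call; it is an illustration, not the operator's real code.
+
+```python
+import boto3
+from datetime import datetime, timezone
+
+def zones_by_spot_price(instance_type, region="us-east-2"):
+    """Return availability zones sorted by current spot price, cheapest first."""
+    ec2 = boto3.client("ec2", region_name=region)
+    history = ec2.describe_spot_price_history(
+        InstanceTypes=[instance_type],
+        ProductDescriptions=["Linux/UNIX"],
+        StartTime=datetime.now(timezone.utc),
+    )["SpotPriceHistory"]
+    # With StartTime set to now, the response contains the price currently in
+    # effect for each AZ; keep one price per AZ and sort ascending.
+    latest = {}
+    for record in history:
+        latest.setdefault(record["AvailabilityZone"], float(record["SpotPrice"]))
+    return sorted(latest, key=latest.get)
+
+# Fallback loop: try the cheapest zone first, move on if capacity is missing.
+# `launch_databricks_cluster` is a hypothetical wrapper around the Jobs API.
+# for az in zones_by_spot_price("m5.xlarge"):
+#     if launch_databricks_cluster(availability_zone=az):
+#         break
+```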
+
+There are a few reasons to try the cheapest zone first. First, it saves up to 10% of our AWS costs. Second, most of the time AWS offers the cheapest price in the AZ with a lot of unused spot instances, helping us avoid insufficient capacity errors.
+
+The Spark-execution environment configuration uses different parameters for describing the needs of the Spark job:
+
+* `executor_memory`
+* `executor_cores`
+* `num_workers`, or minimum/maximum number of workers for auto-scaling.
+
+For jobs using these parameters, we apply a separate algorithm: we use the `executor_memory` and `executor_cores` parameters to find all possible instance types that have enough capacity (_num of cores >= executor_cores, memory >= num of cores * executor_memory / executor_cores_), calculate the number of workers (for large instance types we can decrease the `num_workers` value) and the total cluster cost (_num_workers * (current AWS spot price + Databricks price)_). We sort all possible combinations by total cluster cost, memory, and ECU, and then request a Databricks cluster for the first instance type in the list. If instances of that type are not available, we fall back and try the next option in the list.
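+
+The scoring step can be sketched roughly like this. The candidate list, its field names, and the flat $0.10-per-DBU price are illustrative assumptions drawn from the description above, not the operator's actual implementation:
+
+```python
+DBU_PRICE = 0.10  # assumed price per DBU, as in the example further below
+
+def rank_candidates(candidates, executor_cores, executor_memory_gb, num_workers):
+    """Filter instance types that can host an executor and rank them by cost.
+
+    Each candidate is a dict like:
+    {"instance_type": "m5.xlarge", "az": "us-east-2a", "cores": 4,
+     "memory_gb": 16, "spot_price": 0.0423, "dbu_per_hour": 0.69}
+    """
+    options = []
+    for c in candidates:
+        # Skip instance types that cannot fit a single executor.
+        if c["cores"] < executor_cores:
+            continue
+        if c["memory_gb"] < c["cores"] * executor_memory_gb / executor_cores:
+            continue
+        # Bigger instances hold more executors, so fewer workers are required.
+        workers = max(1, num_workers * executor_cores // c["cores"])
+        total_cost = workers * (c["spot_price"] + c["dbu_per_hour"] * DBU_PRICE)
+        options.append({**c, "num_workers": workers, "total_cost": total_cost})
+    # Cheapest first; prefer more memory when the cost is the same.
+    return sorted(options, key=lambda o: (o["total_cost"], -o["memory_gb"]))
+```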
+
+Using this strategy we may save an extra 10% (compared to the first strategy) and have fewer problems with spot instance availability, because we can use different instance types in different AZs.
+
+These two simple strategies help us to achieve our goal of being flexible with instance types and avoiding capacity problems.
+
+Below you can find examples for each strategy.
+
+Strategy 1: _node_type_id = m5.xlarge, num_workers = 100_:
+
+`DatabricksSparkOperator` uses `AwsBaseHook` to request spot instance prices for instance-type=m5.xlarge, product-description=Linux/UNIX. A similar request with the AWS CLI looks like this:
+
+```bash
+ aws ec2 describe-spot-price-history --instance-type=m5.xlarge --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}'
+```
+AWS Response:
+
+```json
+{ "az": "us-east-2a", "price": "0.042300" }
+{ "az": "us-east-2c", "price": "0.041000" }
+{"az": "us-east-2b", "price": "0.040800" }
+```
+In this situation, we try to request 100 m5.xlarge instances in AZ us-east-2b. If AWS doesn’t have enough capacity in this zone, the `DatabricksSparkOperator` applies the fallback procedure and requests a cluster in us-east-2c.
+
+Strategy 2: _executor_cores = 4, executor_memory = 8G, num_workers = 100_
+
+`DatabricksSparkOperator` requests spot instance prices for instance types that have at least 4 cores and at least 2 GB of memory per core from the list of [possible instance types](https://databricks.com/product/aws-pricing/instance-types).
+
+The AWS response contains a lot of different instance types and AZs, but let me truncate the list and leave only a few rows:
+
+```
+{ "az": "us-east-2a", "price": "0.042300", "instance_type": m5.xlarge" }
+{ "az": "us-east-2c", "price": "0.045000", "instance_type": m5.2xlarge" }
+{ "az": "us-east-2c", "price": "0.045000", "instance_type": r5.2xlarge" }
+```
+We also need to include the cost of Databricks. For ease of calculation, we assume that 1 DBU = $0.10.
+
+Databricks prices:
+
+* m5.xlarge (0.69 DBU/hr)
+* m5.2xlarge (1.37 DBU/hr)
+* r5.2xlarge (1.37 DBU/hr)
+
+We calculate the total cluster cost using the formula from above: _num_workers * (current AWS spot price + Databricks price)_.
+
+
+Results:
+
+```
+{ "az": "us-east-2a", "price": "0.042300", "instance_type": "m5.xlarge", "num_of_worker" = 100, "total_cost" = "0.29187" }
+// Below clusters require only 50 workers, because 2xlarge instance types have 8 cores and more than 2GB/core
+{ "az": "us-east-2c", "price": "0.045000", "instance_type": "m5.2xlarge", "num_of_worker" = 50, "total_cost" = "0.30825"}
+{ "az": "us-east-2c", "price": "0.045000", "instance_type": "r5.2xlarge","num_of_worker" = 50, "total_cost" = "0.30825"}
+```
+So, in this situation we try to request 100 m5.xlarge instances in us-east-2a, then 50 r5.2xlarge instances in AZ us-east-2c (the total price is similar to m5.2xlarge, but r5 instances have 2x the memory for the same money), and after that 50 m5.2xlarge instances in AZ us-east-2c.
+
+After experimenting with this change, we rolled it into our production Airflow environment and are already seeing more reliable job execution, with no noticeable impact on batch job performance. There are dozens of ways to optimize data platform costs in the cloud, but a good place to start is by looking at what your workloads actually need!
+
+---
+
+On the Core Platform and Data Engineering teams we continue to invest in Spark and Airflow and would
+love your help building out our reliable and high-performance data platform,
+[come join us!](/careers/#open-positions)
diff --git a/_posts/2020-12-14-Recycle-EKS-Worker-Nodes.md b/_posts/2020-12-14-Recycle-EKS-Worker-Nodes.md
new file mode 100644
index 0000000..5407ad1
--- /dev/null
+++ b/_posts/2020-12-14-Recycle-EKS-Worker-Nodes.md
@@ -0,0 +1,90 @@
+---
+layout: post
+title: "Automatically recycling EKS worker nodes"
+author: Kuntalb
+tags:
+- eks
+- kubernetes
+- lambda
+- step function
+- terraform
+- featured
+team: Core Platform
+---
+
+A few months ago, we came across a problem: we needed to upgrade our Kubernetes
+version in AWS EKS without incurring downtime. Getting the control plane
+upgraded without downtime was relatively easy (manual, but easy). The bigger challenge
+was getting the physical worker nodes updated. We had to manually complete each of the following steps:
+
+1. Create a new worker node with the latest configuration.
+2. Put the old node in standby mode.
+3. Taint the old node as unschedulable.
+4. Then wait for all our existing pods to die gracefully. In our case, we had some really long running pods, some of which took 20 hours or more to actually finish!
+5. Then detach and kill the old node.
+
+While doing that, we kept thinking: how about an automated module that
+does all of this work with a single button click? We are pleased to open source
+and share our [terraform-aws-recycle-eks
+module](https://github.com/scribd/terraform-aws-recycle-eks), which does all of
+these steps for us!
+
+## What Problems Does It Solve
+
+1. Periodic recycling of old worker nodes. In fact, we can create a lifecycle hook while creating the node and integrate that hook with this module. That way, periodic recycling is fully automated via the lifecycle hook, with zero downtime via this module and no need for manual intervention at all.
+2. Minimal manual intervention while recycling a worker node.
+3. It can be integrated with SNS/CloudWatch events, so that if there is a CPU spike in the middle of the night, the Step Function can step in and create a new node while allowing the old node to die gracefully. That way, all new incoming tasks can be handled by the new node, reducing pressure on the existing node while we investigate the root cause and stay in service. There are plenty more use cases like this (see the sketch after this list).
+4. It can make upgrading and patching of Kubernetes and EKS worker nodes much easier.
+5. The module also takes a custom label selector as an input, which lets the user wait only for the pods that matter; everything else is ignored while waiting for pods to finish gracefully.
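+
+As a sketch of the event-driven use case above, a tiny Lambda could forward an alarm to the Step Function. The state machine ARN and the event shape below are placeholders; the actual input format is defined by the module:
+
+```python
+import json
+import boto3
+
+sfn = boto3.client("stepfunctions")
+
+def handler(event, context):
+    """Kick off the recycle Step Function for an unhealthy node.
+
+    Assumes the triggering event (e.g. an SNS/CloudWatch notification) carries
+    the EC2 instance id of the node to recycle.
+    """
+    instance_id = event["instance_id"]  # placeholder event shape
+    sfn.start_execution(
+        stateMachineArn="arn:aws:states:us-east-1:123456789012:stateMachine:recycle-eks-node",
+        input=json.dumps({"instance_id": instance_id}),
+    )
+```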
+
+## Components
+
+### Terraform
+
+[Terraform](https://terraform.io) has always been our tool of choice for managing infrastructure, and using Terraform for this module also gives us the opportunity to integrate it seamlessly with all of our other existing infrastructure.
+
+### Lambdas and Step Functions
+
+[Orchestrating Amazon Kubernetes Service
+(EKS)](https://medium.com/@alejandro.millan.frias/managing-Kubernetes-from-aws-Lambda-7922c3546249)
+from [AWS Lambda and Amazon EKS Node
+Drainer](https://github.com/aws-samples/amazon-k8s-node-drainer) has already
+set a precedent that Lambdas can be a great tool to manage EKS clusters.
+However, Lambdas have one notable limitation in that they are very short-lived.
+If we run all steps through a single Lambda function, it will eventually
+time out while waiting for all existing pods to complete. So we need to split up
+the workflow into multiple Lambdas and manage
+their lifecycles through a workflow manager. This is where
+[Step Functions](https://aws.amazon.com/step-functions/?step-functions.sort-by=item.additionalFields.postDateTime&step-functions.sort-order=desc) enter the picture.
+Using a Step Function not only solves the problem of Lambda time-outs but also
+provides us an opportunity to extend this module to be triggered automatically
+based on events.
+
+## Design
+
+1. Create a [Step Function](https://github.com/scribd/terraform-aws-recycle-eks/blob/main/step-function.json) that consists of 4 Lambdas. This Step Function handles the transfer of inputs across the Lambda functions.
+2. The [first Lambda](https://github.com/scribd/terraform-aws-recycle-eks/blob/main/lambdas/putNodesToStandby.py) takes an instance ID as input and puts that instance into standby, using the Auto Scaling API to automatically add a new instance to the group. The old instance only enters the "Standby" state once the new instance is fully "InService" (a minimal sketch of this step follows the list).
+3. A second [Lambda](https://github.com/scribd/terraform-aws-recycle-eks/blob/main/lambdas/taintNodes.py) taints this "Standby" node in EKS using the Kubernetes API to prevent new pods from being scheduled onto it.
+4. Another [Lambda](https://github.com/scribd/terraform-aws-recycle-eks/blob/main/lambdas/checkNodesForRunningPods.py) periodically uses the Kubernetes API to check the status of “stateful” pods on that node, based on the label selector provided.
+5. Once all stateful pods on the node have completed, i.e. the number of running pods reaches 0, a final [Lambda](https://github.com/scribd/terraform-aws-recycle-eks/blob/main/lambdas/detachAndTerminateNode.py) shuts down the standby instance using the AWS SDK.
+6. We are not terminating the node, only shutting it down, just in case. In future releases, we will start terminating the nodes.
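+
+For reference, the standby step from point 2 boils down to a couple of Auto Scaling API calls. This is a simplified sketch with illustrative names; the linked Lambda source is the authoritative implementation:
+
+```python
+import boto3
+
+def put_node_to_standby(instance_id):
+    """Move the old worker into Standby without reducing desired capacity,
+    so the Auto Scaling group launches a replacement instance."""
+    asg = boto3.client("autoscaling")
+
+    # Find the Auto Scaling group that owns this instance.
+    described = asg.describe_auto_scaling_instances(InstanceIds=[instance_id])
+    asg_name = described["AutoScalingInstances"][0]["AutoScalingGroupName"]
+
+    asg.enter_standby(
+        InstanceIds=[instance_id],
+        AutoScalingGroupName=asg_name,
+        ShouldDecrementDesiredCapacity=False,
+    )
+```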
+
+## Sample Execution
+
+
+
+*Sample execution output of the Step Function*
+
+## Future Enhancements
+
+1. The first Lambda sleeps for an arbitrary 300 seconds to ensure that the new node is "InService" before putting the old node into standby. Ensure this programmatically instead of sleeping.
+2. Use a common module for getting the access token.
+3. Better logging and exception handling
+4. Make use of the namespace input while selecting pods. Currently the module checks for pods in all namespaces.
+5. The module doesn't work without a manual edit of `configmap/aws-auth`. Find a Terraform way to edit it.
+
+---
+
+Within Scribd's Platform Engineering group we have a *lot* more services than
+people, so we're always trying to find new ways to automate our infrastructure.
+If you're interested in helping to build out a scalable data platform to help
+change the way the world reads, [come join us!](/careers/#open-positions)
diff --git a/_posts/2020-12-21-sidekiq-incident-learnings.md b/_posts/2020-12-21-sidekiq-incident-learnings.md
new file mode 100644
index 0000000..b09f71c
--- /dev/null
+++ b/_posts/2020-12-21-sidekiq-incident-learnings.md
@@ -0,0 +1,82 @@
+---
+layout: post
+title: "Learning from incidents: getting Sidekiq ready to serve a billion jobs"
+author: nakulpathak3
+tags:
+- incident response
+- sidekiq
+- monitoring
+- featured
+team: Internal Tools
+---
+
+Scribd currently serves hundreds of Sidekiq jobs per second and has served 25 billion jobs since its adoption 2 years ago. Getting to this scale wasn’t easy. In this post, I’ll walk you through one of our first ever [Sidekiq](https://sidekiq.org/) incidents and how we improved our Sidekiq implementation as a result of this incident.
+
+### The Incident
+
+A large number of jobs for importing podcasts into Scribd were enqueued via Sidekiq. They took many hours to run, and since they were added to our “default” queue, all our servers picked them up (unlike jobs in the “bulk” queue). These jobs quickly starved out all other jobs, including the highest priority ones.
+
+**Detection:** The incident was detected by an internal user noticing the queue build-up in Sidekiq’s web UI and a corresponding customer complaint that we linked back to this issue. Our systems were negatively affected for around **7 hours** and the incident was noticed at the 6-hour mark.
+
+**Resolution:** We ran a script on production to delete all existing jobs of this problematic worker from Sidekiq’s Redis instance and removed the batch job that was enqueuing them. We let the currently running jobs finish since killing them would require ssh-ing and running risky sudo commands on production servers.
+
+### What we learned
+
+Since this was pretty much our first ever major Sidekiq incident, we wrote an in-depth incident review that focused on 4 problem areas -
+
+#### Quicker Detection
+
+Our mean-time-to-detect this incident was way too high. To address this, we needed metrics and alerting. Since we have a Sidekiq Enterprise license, we simply integrated the [Pro](https://github.com/mperham/sidekiq/wiki/Pro-Metrics#enabling-metrics) and [Enterprise metrics](https://github.com/mperham/sidekiq/wiki/Ent-Historical-Metrics) into our existing Ruby Dogstatsd client.
+
+We added the following Datadog monitors -
+
+* X queue latency > Y value over past Z minutes
+* % of job failures / total jobs > X% over last Y minutes
+
+#### Quicker Debugging
+
+To help add some debugging power to the monitors above, we also created some useful dashboards.
+
+
+
+We added Sidekiq system-level, queue-level, and worker-level graphs that allow us to quickly go from system health to queue health to erroneous worker. From there, we can go over to the worker dashboard to find out whether the issue is around processing time or job failures and debug further in Sentry if needed.
+
+
+
+Later, as Scribd adopted Datadog further, we added [APM for Sidekiq](https://docs.datadoghq.com/tracing/setup_overview/setup/ruby/#sidekiq) which covered a lot of the functionality we had but also added tracing of worker performance to further debug issues.
+
+#### Quicker Resolution
+
+Now that we’re able to quickly identify incidents and debug them, the next step is to resolve the issue.
+
+Something we learned from the incident was that editing Sidekiq Redis while it is already overloaded is a slow and highly error-prone process. To overcome this, we utilized Sidekiq’s ability to [inject custom middlewares](https://github.com/mperham/sidekiq/wiki/Middleware).
+
+**Job Dropping Middleware:** We created a client middleware that would check a worker’s name against a live feature flag, `sidekiq_dropped_workers`, to decide if that worker should execute or be dropped pre-execution. This allowed us to “drain” a specific worker without having to manually edit Sidekiq Redis.
+
+
+
+**Job Disabling Middleware:** In some cases, the worker’s issues may be easily resolvable in an upcoming deploy, or re-enqueuing the workers may be extremely difficult. To address such cases, we introduced the `sidekiq_disabled_workers` feature flag, which utilized Sidekiq’s [ScheduledSet](https://github.com/mperham/sidekiq/wiki/Scheduled-Jobs) to return those jobs to Redis to be run 24 hours later.
+
+
+
+**Job Termination Incidents Page:** Finally, it was essential to find a way to quickly terminate existing problematic workers that have overtaken the queue. Sidekiq’s web UI is also [quite extensible](https://github.com/mperham/sidekiq/issues/3335) so we added a new web page called the “Incidents” tab which allows us to pause queues and terminate existing processes.
+
+
+
+#### Future prevention
+
+The team that added the problematic worker was not aware of Sidekiq’s shared model of usage and their worker’s ability to affect the system. They didn’t know when they should be using the default queue or the bulk queue.
+
+**Documentation:** We created processing time and worker importance expectations for each queue. We listed best practices such as using timeouts, preferring multiple smaller jobs, idempotency, etc. and we linked to the [Sidekiq documentation](https://github.com/mperham/sidekiq/wiki/Best-Practices) where we felt people may want more information.
+
+**Runbook:** We also created an Incident Handling runbook that walks people through finding a problematic worker, debugging, and resolving the incident.
+
+
+
+**Guardrails:** We also seriously considered adding timeouts which would forcefully terminate workers that go significantly over their queue’s expected processing time. However, we settled for a Sentry exception for workers that miss our guidelines, auto-assigned to the team that owns the worker (via the CODEOWNERS file). This approach has been sufficient for us so far.
+
+### Where we are now
+
+Our systems are far from perfect but Sidekiq issues are now recognized within 5-10 minutes of their occurrence and usually resolved with no significant production impact.
+
+When we addressed these incidents, we were running on data center servers, but since then we’ve moved our workloads to AWS Fargate tasks. We’d like to add queue-based auto-scaling and the ability to recognize and auto-resolve database performance degradation caused by Sidekiq workers.
diff --git a/_posts/2021-03-02-github-actions-datadog-reporting.md b/_posts/2021-03-02-github-actions-datadog-reporting.md
new file mode 100644
index 0000000..50fd811
--- /dev/null
+++ b/_posts/2021-03-02-github-actions-datadog-reporting.md
@@ -0,0 +1,55 @@
+---
+layout: post
+title: "Unifying developer velocity metrics in Datadog with GitHub Actions"
+author: ajhofmann
+tags:
+- monitoring
+- datadog
+- featured
+team: Internal Tools
+---
+
+At Scribd we have a wide variety of projects and repositories that our developers work on every day. The Internal Tools team is dedicated to creating tooling and automation that empowers developers to deliver code as swiftly as possible. A standardized and unified method to report metrics around developer velocity and CI/CD is therefore key to being able to identify areas for improvement and measure success in improving developer workflows.
+
+### GitHub Actions
+
+[GitHub Actions](https://github.com/features/actions) offers a CI/CD solution to build, test and deploy code directly in GitHub. One of the key features of GitHub Actions is the ability to create an open source action that can be easily used by any other GitHub Actions workflow in a few lines. The actions currently on the market range from installing languages like [Ruby](https://github.com/ruby/setup-ruby) to [posting messages to Slack](https://github.com/abinoda/slack-action) and all sorts of other [awesome things](https://github.com/sdras/awesome-actions). Some actions provide the ability to report [custom Datadog metrics](https://github.com/marketplace/actions/datadog-action) from a workflow, but there weren't any actions that automatically collected, formatted and reported development or developer velocity metrics to Datadog.
+
+### Datadog Reporting in GitHub Actions
+
+Without a solution on the [GitHub Actions marketplace](https://github.com/marketplace?type=actions) to accomplish what we wanted, the Internal Tools team created a GitHub Action that could be used across all of Scribd’s projects and teams to report metrics that give us a view of how fast we are able to deliver from the organization level all the way down to specific projects.
+
+With our now published [open source GitHub Action](https://github.com/scribd/github-action-datadog-reporting) we provide the ability for a quick lightweight job to be added to the end of any GitHub workflow that reports the duration of every job and the entire duration of the workflow directly to [Datadog](https://www.datadoghq.com/). The action can also be integrated into a standalone workflow that calculates and reports how long pull requests take to open, how many lines are changed, and how long the pull request takes to move from open to merge.
+
+Additionally, all of the metrics are automatically tagged by the Action with information such as whether the durations are from passed or failed jobs, as well as the repository, workflow and job that the durations correspond to. This information allows us to create fully customizable Datadog dashboards that can focus on the velocity of the organization, a single team, a project and even a single job in the workflow.
+
+### Putting the Data to Use
+
+Going forward, these unified metrics across the projects will enable the Internal Tools team to identify potential areas of slow down for developers at Scribd, and measure the success of our efforts to enable developers to ship clean and correct code as efficiently as possible.
+
+When all metrics are reported using the same prefix to Datadog, we can leverage tags and templates to easily make dashboards of any scope, from tracking the organization's velocity, all the way down to that of a single project or workflow. When not filtering anything we can see the developer velocity events across all installed projects:
+
+
+
+
+The time-to-merge metric supports tags for team and repository, so we can easily add filters for any single tag or combination of tags.
+
+
+
+One of the key features of the action is tracking job and workflow timing.
+
+
+
+
+
+The above graphs represent workflow runtime data collected from two different projects. By filtering the reports using the project and workflow tags, we can watch the workflows for any trends that might be slowing them down and track when an issue started and how significant it is. In the above example it looks like the “First Workflow” is having some performance issues, so let’s break the job duration metric down by the jobs in the workflow.
+
+
+
+Looking at the job breakdown makes it very clear where our issue is: Job 3 has been causing performance issues in the workflow since Friday morning and will need to be fixed. Note that the above graphs have had their workflow and job names obscured for the purposes of this blog.
+
+### Into the Future
+
+With the GitHub Action published on the public marketplace, Scribd will continue to integrate the action across its projects for increased monitoring and issue tracking. The code is now [open sourced and available on GitHub](https://github.com/scribd/github-action-datadog-reporting) and contributions are welcome.
+
+If you would like to join the Internal Tools team or Scribd on our journey, then take a look at our [careers page](/careers/#open-positions).
diff --git a/_posts/2021-03-11-introducing-sql-delta-import.md b/_posts/2021-03-11-introducing-sql-delta-import.md
new file mode 100644
index 0000000..ec708f9
--- /dev/null
+++ b/_posts/2021-03-11-introducing-sql-delta-import.md
@@ -0,0 +1,161 @@
+---
+layout: post
+title: "Importing MySQL Data into Delta Lake"
+author: alexk
+tags:
+- databricks
+- spark
+- deltalake
+- featured
+team: Data Engineering
+---
+
+OLTP databases are a common data source for Data Lake based warehouses which use Big Data tools to run
+batch analytics pipelines. The classic Apache Hadoop toolchain includes
+[Apache Sqoop](https://sqoop.apache.org/) - a tool for bulk import/export
+of data between HDFS and relational data stores. Our pipelines were using this tool as well, primarily
+to import MySQL data into HDFS. When the Platform Engineering team took on the migration of
+our on-premise Hadoop workloads to the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)
+on AWS, we had to write our own tool to import data from MySQL directly into S3-backed [Delta Lake](https://delta.io/).
+In this post I will share the details about `sql-delta-import`, an open source utility we have proposed for inclusion in the
+[Delta Lake
+Connectors](https://github.com/delta-io/connectors/pull/80) project. We're
+looking forward to working with others to improve and accelerate importing data
+into Delta Lake!
+
+### Sample import
+
+Importing data into a Delta Lake table is as easy as
+
+```sh
+spark-submit \
+--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \
+--jdbc-url jdbc:mysql://hostName:port/database \
+--source source.table \
+--destination destination.table \
+--split-by id
+```
+
+### This looks a lot like `sqoop`... why didn't you just use that?
+
+We considered using `sqoop` at first but quickly dismissed that option for multiple reasons:
+
+#### 1. Databricks Lakehouse Platform does not come with `sqoop`
+Yes, we could have run our sqoop jobs on EMR clusters, but we wanted to run everything in Databricks and
+avoid an additional technology footprint and overhead. But even if we drop that restriction...
+
+#### 2. `sqoop` does not support writing data directly to Delta Lake
+`sqoop` can only import data as text or parquet. Writing directly to Delta allows us to
+optimize data storage for the best read performance by just adding a couple of configuration options:
+
+```sh
+spark-submit \
+--conf spark.databricks.delta.optimizeWrite.enabled=true \
+--conf spark.databricks.delta.autoCompact.enabled=true \
+--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \
+--jdbc-url jdbc:mysql://hostName:port/database \
+--source source.table \
+--destination destination.table \
+--split-by id
+```
+
+#### 3. `--num-mappers` is just not good enough to control parallelism when working with a database
+`sqoop` uses map-reduce under the hood. We can specify the `--num-mappers` parameter, which controls how many
+mappers will be used to import data. A small number of mappers can result in a large volume
+of data per import and long-running transactions. A large number of mappers will result in many connections
+to the database, potentially overloading it, especially when there are a lot of `sqoop` jobs running in parallel.
+Additionally, since there are no reduce stages in `sqoop` jobs, a large number of mappers will result in a large
+number of output files, potentially introducing a small-files problem.
+
+`sql-delta-import` uses the `--chunks` parameter to control the number of... well... chunks to split the source table
+into, and standard Spark parameters like `--num-executors` and `--executor-cores` to control data import
+concurrency, thus allowing you to tune those parameters independently:
+
+```sh
+spark-submit --num-executors 15 --executor-cores 4 \
+--conf spark.databricks.delta.optimizeWrite.enabled=true \
+--conf spark.databricks.delta.autoCompact.enabled=true \
+--class "io.delta.connectors.spark.JDBC.ImportRunner" sql-delta-import_2.12-0.2.1-SNAPSHOT.jar \
+--jdbc-url jdbc:mysql://hostName:port/database \
+--source source.table \
+--destination destination.table \
+--split-by id \
+--chunks 500
+```
+
+In the example above the source table will be split into 500 chunks, resulting in quick transactions and released connections,
+but no more than 60 concurrent connections will be used for import since the max degree of parallelism is 60 (15 executors x 4 cores).
+The `delta.optimizeWrite` and `delta.autoCompact` configurations will yield optimally sized output files for the destination table.
+
+#### 3.1 `--num-mappers` and data skew just don't play nicely together
+
+When `sqoop` imports data, the source table is split into ranges based on the `--split-by` column and each mapper
+imports its corresponding range. This works well when the `--split-by` column has a near uniform distribution
+of data, but that's not always the case with source tables... As tables age we tend to add additional columns to them to
+take on new business requirements, so over time data in the latest rows has a higher fill rate than in earlier rows.
+
+
+
+Our source tables here at Scribd definitely have these characteristics. We also have some tables that have entire
+ranges of data missing due to data cleanup. At some point large chunks of data were just deleted from these tables.
+
+
+
+This type of data skew results in processing time skew and output file size skew when you can only control the number of
+mappers. Yes, we could introduce an additional computed synthetic column in the source table as our `split-by` column, but now
+there is an additional column that does not add business value, app developers need to be aware of it, computing and
+storing it takes up database resources, and if we plan to use it for imports it had better be indexed, requiring even more
+compute and storage resources.
+
+With `sql-delta-import` we still split source tables into ranges based on the `--split-by` column, but if there is data
+distribution skew we can "solve" this problem by making the number of chunks much larger than the max degree of parallelism.
+This way large chunks with high data density are broken up into smaller pieces that a single executor can handle.
+Executors that get chunks with little or no data can quickly process them and move on to do some real work.
+
+
+### Advanced use cases
+
+For advanced use cases you don't have to use the provided Spark application directly. The `sql-delta-import`
+libraries can be imported into your own project. You can specify custom data transformations or a JDBC dialect to gain
+more precise control over data type handling:
+
+```scala
+import org.apache.spark.sql._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+
+import io.delta.connectors.spark.JDBC._
+
+implicit val spark: SparkSession = SparkSession.builder().master("local").getOrCreate()
+
+
+// All additional possible jdbc connector properties described here - https://dev.mysql.com/doc/connector-j/8.0/en/connector-j-reference-configuration-properties.html
+val jdbcUrl = "jdbc:mysql://hostName:port/database"
+
+val config = ImportConfig(source = "table", destination = "target_database.table", splitBy = "id", chunks = 10)
+
+// a sample transform to convert all timestamp columns to strings
+val timeStampsToStrings : DataFrame => DataFrame = source => {
+ val tsCols = source.schema.fields.filter(_.dataType == DataTypes.TimestampType).map(_.name)
+ tsCols.foldLeft(source)((df, colName) =>
+ df.withColumn(colName, from_unixtime(unix_timestamp(col(colName)), "yyyy-MM-dd HH:mm:ss.S")))
+}
+
+// Whatever functions are passed to below transform will be applied during import
+val transforms = new DataTransform(Seq(
+ df => df.withColumn("id", col("id").cast(types.StringType)), //custom function to cast id column to string
+ timeStampsToStrings //included transform function converts all Timestamp columns to their string representation
+))
+
+val importer = new JDBCImport(jdbcUrl = jdbcUrl, importConfig = config, dataTransform = transforms)
+
+importer.run()
+```
+
+Prior to migrating to the Databricks Lakehouse Platform we had roughly 300 `sqoop` jobs. We were able to
+successfully port all of them to `sql-delta-import`. Today they happily coexist in production with other Spark
+jobs, allowing us to use a uniform set of tools for orchestrating, scheduling, monitoring and logging all of our jobs.
+
+If you're interested in working with Delta Lake, the Databricks platform, or
+enabling really interesting machine learning use-cases, check out our [careers
+page](/careers/#open-positions)!
diff --git a/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md b/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md
new file mode 100644
index 0000000..0f42eb1
--- /dev/null
+++ b/_posts/2021-03-16-introducing-lucid-a-swift-library-for-building-robust-data-flows.md
@@ -0,0 +1,82 @@
+---
+layout: post
+title: "Introducing Lucid: A Swift Library For Building Robust Data Flows"
+author: trupin
+tags:
+- swift
+- architecture
+- codegen
+- featured
+team: iOS
+---
+
+Lucid is a Swift library which provides a series of convenient tools for building robust data layers for applications.
+
+We built it with three main ideas in mind:
+
+- **Declarative**: Lucid makes it easy to declare complex data models and provides the tools to use them with plain Swift code.
+- **Modularity**: Use the technologies which suit your data flow best. Lucid gives you the infrastructure to seamlessly integrate the stack you want to use.
+- **Adaptability**: Built to fit most kinds of standard and non-standard server APIs, Lucid abstracts away server-side structural decisions by providing a universal client-side API.
+
+Today we're happy to open source Lucid so that developers around the world can use it in their own applications.
+
+### Why Lucid?
+
+At Scribd, the iOS application has always been a huge part of our business. As we kept adding new features to it, the codebase became more and more complex to maintain. One of the biggest hurdles we encountered was an inconsistency in how data was handled throughout the app.
+
+We decided to tackle this issue by providing a series of tools that would help us handle all of our data flows throughout the app with a single uniform API. This is how Lucid was born.
+
+Our entire iOS codebase has now been migrated to using Lucid for things like fetching data from the servers, storing data to disk, resolving read/write conflicts for data stored locally and remotely, listening to local data changes, etc…
+
+### What features does Lucid provide?
+
+The first thing to think about when working with Lucid are the data models. Lucid lets you define your data models in the form of JSON files. Those files are then interpreted by Lucid's command line tool, which generates all of the boilerplate code you'll need to handle those models in your application.
+
+Once imported in your project, here's what the generated code, coupled with Lucid's framework, can help you with:
+
+- Read data from your servers, local storage, or both, with a query system supported by a full featured DSL expressed in Swift.
+- Write data to your servers, or local storage, or both.
+- Listen to local data changes based on queries.
+- Standard store implementations using CoreData, in-memory caching, and disk caching.
+- Easily create and use your own stores.
+- Adaptable to most backend APIs, even those serving data in the form of a tree or graph.
+- Automatic model relationship(s) fetching.
+- External plugin support for code generation.
+
+### The Design
+
+
+
+Lucid lets you use two main types of objects:
+
+- **Entity objects**, which are automatically generated from their respective JSON description files. They represent your data.
+- **Manager objects**, which provide a uniform API to read/write data to multiple locations.
+
+Internally, each manager interacts with as many stores as needed. There are two types of stores:
+
+- **Remote stores**. They represent the servers and directly talk to them via HTTP.
+- **Local stores**. They represent a local source of data such as a key/value cache, a Core Data database, etc...
+
+In short, managers are in charge of synchronizing the data between the stores. Stores are in charge of bridging the data to specific technologies.
+
+There is much more to discover about Lucid in the [documentation](https://github.com/scribd/Lucid/tree/master/Documentation/Manual).
+
+### Who is Lucid for?
+
+Lucid is for developers who don't want to reinvent the wheel every time they need to read/write data in their application. With Lucid, you are able to declare your data models once, then choose whichever built-in functionality you need to build a robust data flow.
+
+Lucid was designed to let you focus on data modeling rather than implementation details. For example, if you decide you want to store your data to disk, you just need to add a single line to your object's JSON description.
+
+### Where can I find Lucid?
+
+Lucid is available on [Github](https://github.com/scribd/Lucid) under the MIT license.
+
+If you like Lucid, you might like other open source projects we developed at Scribd which you can find on our [Github page](https://github.com/scribd).
+
+### Can I contribute?
+
+You are more than welcome to contribute to Lucid. You can open a PR or file issues on [Github](https://github.com/scribd/Lucid). Please refer to our [contributions guidelines](https://github.com/scribd/Lucid/blob/master/CONTRIBUTING.md) before doing so.
+
+
+If you're interested in building great mobile applications with us check out our
+[careers page](/careers/#open-positions)!
diff --git a/_posts/2021-03-18-faster-fargate-deploys.md b/_posts/2021-03-18-faster-fargate-deploys.md
new file mode 100644
index 0000000..a6e8b2f
--- /dev/null
+++ b/_posts/2021-03-18-faster-fargate-deploys.md
@@ -0,0 +1,60 @@
+---
+layout: post
+title: "Speeding up ECS Fargate deployments"
+author: nakulpathak3
+tags:
+- aws
+- deploys
+- featured
+team: Internal Tools
+---
+
+Scribd moved its monolith to AWS in April 2020 and as part of the migration, we had to design and implement a deployment pipeline for our new (and *shiny*) [ECS Fargate](https://aws.amazon.com/fargate/) infrastructure. In this post, we'll share how we improved our deployment speeds from ~40 minutes to less than 20 minutes.
+
+### Original Implementation
+
+Our starting implementation involved a few steps:
+- Deploying assets via [Capistrano](https://capistranorb.com/) to our asset-hosting servers *(2.5 minutes)*
+- Executing a Fargate task to run any database migrations *(3 minutes)*
+- Restarting and waiting on ~500 Fargate tasks via the AWS CLI *(32-35 minutes)*
+
+### Improvements
+
+#### Fargate Service Updates
+By far, the slowest part of our deployment was waiting for ECS services to finish updating. We use the default rolling deployment which stops and starts tasks to trigger a re-pulling of the freshly-uploaded [ECR](https://aws.amazon.com/ecr/) image. Here are some changes we implemented -
+
+* **Docker Image Size Reduction** - The first thing everyone thinks of when considering ECS Fargate speedups is how to reduce the image pull time, since Fargate (unlike EC2) [has no image caching](https://github.com/aws/containers-roadmap/issues/696). However, unless you can drastically reduce your image size (think 1 GB to 100 MB), this will not lead to significant time reductions. We reduced our compressed image size from ~900 MB to ~700 MB and it led to **little to no improvement**. It did lead to a cleaner image, but that wasn't our initial goal.
+
+* [**Deregistration Delay**](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-target-groups.html#deregistration-delay) - This is a property on a load balancer's target group that dictates how long a task stays in *Draining* state after it stops receiving requests. We looked at Datadog APM for the p99 of our longest running request and set the delay to 17s from the **default of 300s**. This reduced service refreshes to ~22 minutes.
+
+* **ECS Throttling** - During deployments, we investigated the "Events" tab of our main web ECS service. There were events with the following messages -
+ - *"service production-web operations are being throttled on elb. Will try again later."*
+ - *"service production-web operations are being throttled. Will try again later."*
+
+ Due to Scribd's high Fargate task volume, the number of start and stop requests we were making was too high for AWS' default limits. We opened support tickets with the ELB and Fargate teams to get those limits increased. This further reduced service deploy time to 16-18 minutes.
+
+* **Network Load Balancer Health Checks** - From testing in staging, we noticed that reducing our network load balancer's [health-check intervals and thresholds](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/target-group-health-checks.html) helped reduce staging deploy time from ~9 to ~6 minutes. However, it only translated to 1-2 minutes saved in production with its much higher number of ECS tasks. You do want to be careful with these values to avoid false-positive health checks, and keep in mind that updating them requires re-creation of the ECS service the load balancer points to.
+
+#### Asset Deployment Improvements
+Our asset deployments were run using Capistrano. The job `ssh`-ed onto our asset servers and ran a series of [Rake tasks](https://guides.rubyonrails.org/v4.2/command_line.html#rake) to download, unzip, and correctly place assets. There were some issues with this approach -
+* Dependency on the Capistrano gem forced us to use the monolith Docker image as the job's base image
+* Running Rake tasks required loading the application, which added time to the job
+* Our ECS service refresh job runs `docker push/pull` tasks to upload the latest image to ECR. This forced us to have separate jobs for asset and service deployments to avoid adding a Docker dependency to the monolith image.
+
+To resolve these issues, we decided to remove Capistrano & Rake as dependencies and wrote pure Ruby and Bash code to perform the tasks. This unified the two jobs and brought asset deploy time from 2.5 minutes to 30s.
+
+#### Database Migration
+In our case, running a database migration task in Fargate involved starting a new task instance of our `database_migration` task family. Due to Fargate startup slowness, this task would take 3 minutes to run a simple `bundle exec rails db:migrate`.
+
+To resolve this, we used `git` and [Gitlab environments](https://docs.gitlab.com/ee/api/environments.html#get-a-specific-environment) to look for modified files in the `db/migrate` directory. If none were found, we would skip running the migration task. Since the majority of our deployments don't run database migrations, this shaved 3 minutes off most jobs.
+```bash
+env_json=$(curl --silent --header "PRIVATE-TOKEN: " "/environments/")
+last_deployment_sha=$(echo $env_json | jq -r '.last_deployment.sha')
+git diff --name-only $CI_COMMIT_SHA $last_deployment_sha | grep db/migrate
+```
+
+#### Other things to look for
+If you run sidecar containers like Datadog, make sure that you're providing enough memory and CPU to those containers to avoid waiting on them to be ready while your main container has already started.
+
+
+We hope this helps you speed up your deployments and gain greater efficiency!
diff --git a/_posts/2021-04-12-embedding-based-retrieval-scribd.md b/_posts/2021-04-12-embedding-based-retrieval-scribd.md
new file mode 100644
index 0000000..e31f210
--- /dev/null
+++ b/_posts/2021-04-12-embedding-based-retrieval-scribd.md
@@ -0,0 +1,288 @@
+---
+layout: post
+title: "Embedding-based Retrieval at Scribd"
+author: div
+tags:
+- machinelearning
+- real-time
+- search
+- featured
+team: Recommendations
+---
+
+Building recommendations systems like those implemented at large companies like
+[Facebook](https://arxiv.org/pdf/2006.11632.pdf) and
+[Pinterest](https://labs.pinterest.com/user/themes/pin_labs/assets/paper/pintext-kdd2019.pdf)
+can be accomplished using off-the-shelf tools like Elasticsearch. Many modern recommendation systems implement
+*embedding-based retrieval*, a technique that uses embeddings to represent documents, and then converts the
+recommendations retrieval problem into a [similarity search](https://en.wikipedia.org/wiki/Similarity_search) problem
+in the embedding space. This post details our approach to “embedding-based retrieval” with Elasticsearch.
+
+### Context
+Recommendations plays an integral part in helping users discover content that delights them on the Scribd platform,
+which hosts millions of premium ebooks, audiobooks, etc along with over a hundred million user uploaded items.
+
+
+
+*Figure One: An example of a row on Scribd’s home page that is generated by the recommendations service*
+
+Currently, Scribd uses a collaborative filtering based approach to recommend content, but this model limits our ability
+to personalize recommendations for each user. This is our primary motivation for rethinking the way we recommend content,
+and has resulted in us shifting to [Transformer](http://jalammar.github.io/illustrated-transformer/)-based sequential
+recommendations. While model architecture and details won’t be discussed in this post, the key takeaway is that our
+implementation outputs *embeddings* – vector representations of items and users that capture semantic information such
+as the genre of an audiobook or the reading preferences of a user. Thus, the challenge is now how to utilize these
+millions of embeddings to serve recommendations in an online, reliable, and low-latency manner to users as they
+use Scribd. We built an embedding-based retrieval system to solve this use case.
+
+### Recommendations as a Faceted Search Problem
+There are many technologies capable of performing fast, reliable nearest neighbors search across a large number of
+document vectors. However, our system has the additional challenge of requiring support for
+[faceted search](https://en.wikipedia.org/wiki/Faceted_search) – that is, being able to retrieve the most relevant
+documents over a subset of the corpus defined by user-specific business rules (e.g. language of the item or geographic
+availability) at query time. At a high level, we desired a system capable of fulfilling the following requirements:
+
+1. The system should be able to prefilter results over one or more given facets. This facet can be defined as a filter
+over numerical, string, or category fields
+2. The system should support one or more exact distance metrics (e.g. dot product, euclidean distance)
+3. The system should allow updates to data without downtime
+4. The system should be highly available, and be able to respond to a query quickly. We targeted a service-level
+objective (SLO) with p95 of <100ms
+5. The system should have helpful monitoring and alerting capabilities, or provide support for external solutions
+
+After evaluating several candidates for this system, we found Elasticsearch to be the most suitable for our use case.
+In addition to satisfying all the requirements above, it has the following benefits:
+
+- Widely used, has a large community, and thorough documentation which allows easier long-term maintenance and onboarding
+- Updating schemas can easily be automated using pre-specified templates, which makes ingesting new data and maintaining
+indices a breeze
+- Supports custom plugin integrations
+
+However, Elasticsearch also has some drawbacks, the most notable of which is the lack of true in-memory partial updates.
+This is a dealbreaker if updates to the system happen frequently and in real-time, but our use case only requires support
+for nightly batch updates, so this is a tradeoff we are willing to accept.
+
+We also looked into a few other systems as potential solutions. While
+[Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch/) (aka AWS Managed Elasticsearch) was
+originally considered due to its simplicity in deployment and maintenance, we decided not to move forward with this
+solution due to its lack of support for prefiltering. [Vespa](https://vespa.ai/) is also a promising candidate that has a
+bunch of additional useful features, such as true in-memory partial updates, and support for integration with TensorFlow
+for advanced, ML-based ranking. The reason we did not proceed with Vespa was due to maintenance concerns: deploying to
+multiple nodes is challenging since EKS support is lacking and documentation is sparse. Additionally, Vespa requires the
+entire application package containing all indices and their schemas to be deployed at once, which makes working in a
+distributed fashion (i.e. working with teammates and using a VCS) challenging.
+
+### How to Set Up Elasticsearch as a Faceted Search Solution
+
+
+
+*Figure Two: A high level diagram illustrating how the Elasticsearch system fetches recommendations*
+
+Elasticsearch stores data as JSON documents within indices, which are logical namespaces with data mappings and shard
+configurations. For our use case, we defined two indices, a `user_index` and an `item_index`. The former is essentially
+a key-value store that maps a user ID to a corresponding user embedding. A sample document in the `user_index` looks like:
+
+```
+{"_id": 4243913,
+ "user_embed": [-0.5888184, ..., -0.3882332]}
+```
+
+Notice here we use Elasticsearch’s inbuilt `_id` field rather than creating a custom field. This is so we can fetch user
+embeddings with a `GET` request rather than having to search for them, like this:
+
+```
+curl :9200/user_index/_doc/4243913
+```
+
+Now that we have the user embedding, we can use it to query the `item_index`, which stores each item’s metadata
+(which we will use to perform faceted search) and embedding. Here’s what a document in this index could look like:
+
+```
+{"_id": 13375,
+ "item_format": "audiobook",
+ "language": "english",
+ "country": "Australia",
+ "categories": ["comedy", "fiction", "adventure"],
+ "item_embed": [0.51400936,...,0.0892048]}
+```
+
+We want to accomplish two goals in our query: retrieving the most relevant documents to the user (which in our model
+corresponds to the dot product between the user and item embeddings), and ensuring that all retrieved documents have the
+same filter values as those requested by the user. This is where Elasticsearch shines:
+
+```
+curl -H 'Content-Type: application/json' \
+:9200/item_index/_search \
+-d \
+'
+{"_source": ["_id"],
+ "size": 30,
+ "query": {"script_score": {"query": {"bool":
+ {"must_not": {"term": {"categories": "adventure"}},
+ "filter": [{"term": {"language": "english"}},
+ {"term": {"country": "Australia"}}]}},
+ "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed');
+ return Math.max(0, value+10000);",
+ "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}}
+'
+```
+
+Let’s break this query down to understand what’s going on:
+
+1. Line 2: Here we are querying the `item_index` using Elasticsearch’s `_search` API
+2. Lines 5,6: We specify which attributes of the item documents we’d like returned (in this case, only `_id`), and how many
+results (`30`)
+3. Line 7: Here we are querying using the `script_score` feature; this is what allows us to first prefilter our corpus and
+then rank the remaining subset using a custom script
+4. Lines 8-10: Elasticsearch has various different boolean query types for filtering. In this example we specify that we
+are interested only in `english` items that can be viewed in `Australia` and which are not categorized as `adventure`
+5. Lines 11-12: Here is where we get to define our custom script. Elasticsearch has a built-in `dotProduct` method we
+can employ, which is optimized to speed up computation. Note that our embeddings are not normalized, and Elasticsearch
+prohibits negative scores. For this reason, we had to include the score transformation in line 12 to ensure our scores
+were positive
+6. Line 13: Here we can add parameters which are passed to the scoring script
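+
+For completeness, here is a sketch of issuing the same filtered `script_score` query from application code with the `elasticsearch` Python client. The local host, the body-style call, and the user ID mirror the curl examples above and are illustrative rather than our production code:
+
+```python
+from elasticsearch import Elasticsearch
+
+es = Elasticsearch("http://localhost:9200")
+
+# Fetch the user embedding by document id, as in the earlier GET example.
+user_embed = es.get(index="user_index", id=4243913)["_source"]["user_embed"]
+
+query = {
+    "script_score": {
+        "query": {"bool": {
+            "must_not": {"term": {"categories": "adventure"}},
+            "filter": [{"term": {"language": "english"}},
+                       {"term": {"country": "Australia"}}],
+        }},
+        "script": {
+            "source": "double value = dotProduct(params.user_embed, 'item_embed');"
+                      " return Math.max(0, value+10000);",
+            "params": {"user_embed": user_embed},
+        },
+    }
+}
+
+# The body-style search works across client versions; newer clients also
+# accept query=, size= and source= keyword arguments directly.
+response = es.search(index="item_index",
+                     body={"_source": ["_id"], "size": 30, "query": query})
+item_ids = [hit["_id"] for hit in response["hits"]["hits"]]
+```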
+
+This query will retrieve recommendations based on one set of filters. However, in addition to user filters, each row on
+Scribd’s homepage also has row-specific filters (for example, the row “Audiobooks Recommended for You” would have a
+row-specific filter of `"item_format": "audiobook"`). Rather than making multiple queries to the Elasticsearch cluster
+with each combination of user and row filters, we can conduct multiple independent searches in a single query using the
+`_msearch` API. The following example query generates recommendations for hypothetical “Audiobooks Recommended for You”
+and “Comedy Titles Recommended for You” rows:
+
+```
+curl -H 'Content-Type: application/json' \
+:9200/_msearch \
+-d \
+'
+{"index": "item_index"}
+{"_source": ["_id"],
+ "size": 30,
+ "query": {"script_score": {"query": {"bool":
+ {"must_not": {"term": {"categories": "adventure"}},
+ "filter": [{"term": {"language": "english"}},
+ {"term": {"item_format": "audiobook"}},
+ {"term": {"country": "Australia"}}]}},
+ "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed');
+ return Math.max(0, value+10000);",
+ "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}}
+{"index": "item_index"}
+{"_source": ["_id"],
+ "size": 30,
+ "query": {"script_score": {"query": {"bool":
+ {"must_not": {"term": {"categories": "adventure"}},
+ "filter": [{"term": {"language": "english"}},
+ {"term": {"categories": "comedy"}},
+ {"term": {"country": "Australia"}}]}},
+ "script": {"source": "double value = dotProduct(params.user_embed, 'item_embed');
+ return Math.max(0, value+10000);",
+ "params": {"user_embed": [-0.5888184, ..., -0.3882332]}}}}}
+'
+```
+
+#### Shard Configuration
+Elasticsearch splits each index into shards and stores replica copies of them across multiple nodes for resilience and
+increased query performance. The number of primary shards is configurable only at index creation time (replica counts
+can be changed later). Here are some things to consider regarding shards:
+
+1. Try out various shard configurations to see what works best for each use case.
+[Elastic](https://www.elastic.co/blog/how-many-shards-should-i-have-in-my-elasticsearch-cluster) recommends 20-40GB of data per
+shard, while [eBay](https://tech.ebayinc.com/engineering/elasticsearch-performance-tuning-practice-at-ebay/) likes to keep
+their shard size below 30GB. However, these values did not work for us, and we found much smaller shard sizes (<5GB) to
+boost performance in the form of reduced latency at query time.
+2. When updating data, do not update documents within the existing index. Instead, create a new index, ingest the updated
+documents into it, and [re-alias](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-aliases.html)
+from the old index to the new one. This process lets you retain older data in case an update needs to be reverted, and
+lets you re-configure shards at each update.
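+
+As an illustration of this update flow, here is a minimal sketch using Python and the `requests` library; the index
+and alias names (`item_index_v1`, `item_index_v2`, `item_index`) and the shard counts are hypothetical, not our
+production values:
+
+```python
+import json
+import requests
+
+ES = "http://localhost:9200"  # hypothetical cluster address
+HEADERS = {"Content-Type": "application/json"}
+
+# 1. Create a fresh index; the primary shard count can only be chosen here.
+requests.put(f"{ES}/item_index_v2", headers=HEADERS, data=json.dumps(
+    {"settings": {"number_of_shards": 6, "number_of_replicas": 1}}))
+
+# 2. Bulk-ingest the updated documents into item_index_v2 (omitted here).
+
+# 3. Atomically point the `item_index` alias at the new index.
+requests.post(f"{ES}/_aliases", headers=HEADERS, data=json.dumps({
+    "actions": [
+        {"remove": {"index": "item_index_v1", "alias": "item_index"}},
+        {"add": {"index": "item_index_v2", "alias": "item_index"}},
+    ]}))
+```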
+
+#### Cluster Configuration
+We deployed our cluster across multiple data and primary nodes to enjoy the benefits of data redundancy, increased
+availability, and the ability to scale horizontally as our service grows. We found that deploying the cluster across
+multiple availability zones results in an increased latency during query time, but this is a tradeoff we accepted in the
+interest of availability.
+
+As for hardware specifics, we use AWS EC2 instances to host our cluster. In production, we have 3 `t3a.small` primary-only
+nodes, 3 `c5d.12xlarge` data nodes, and 1 `t3.micro` Kibana node. The primary-only nodes are utilized only in a coordinating
+role (to route requests, distribute bulk indexing, etc), essentially acting as smart load balancers. This is why these
+nodes are much smaller than the data nodes, which handle the bulk of storage and computational costs. Kibana is a data
+visualization and monitoring tool; however, in production we use Datadog for our monitoring and alerting responsibilities,
+which is why we do not allocate many resources for the Kibana node.
+
+### What Generating Recommendations Looks Like
+
+
+
+*Figure Three: a diagram illustrating the system design for Personalization’s Embedding-based Retrieval Service*
+
+Step by step, this is how recommendations are generated when a user requests the home page:
+1. The Scribd app passes the user’s information to the recommendations service
+2. The recommendations service queries Elasticsearch with the user’s ID to retrieve their user embedding, which is stored
+in a user index
+3. The recommendations service once again queries Elasticsearch, this time with the user’s embedding along with their
+user query filters. This query is a multi-search request to the item index: one for every desired row
+4. Elasticsearch returns these recommendations to the service, which are postprocessed and generated into rows before
+being sent to the client
+5. The client renders these recommendations and displays them to the user
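+
+As a rough sketch of steps 2-4 (using Python and `requests` purely for illustration; the real service and its
+endpoints may look different), the two Elasticsearch calls fit together like this:
+
+```python
+import json
+import requests
+
+ES = "http://localhost:9200"  # hypothetical cluster address
+
+def recommend_rows(user_id, row_filters):
+    # Step 2: fetch the user embedding by document id from the user index.
+    user_doc = requests.get(f"{ES}/user_index/_doc/{user_id}").json()
+    user_embed = user_doc["_source"]["user_embed"]
+
+    # Step 3: one _msearch request with one search body per homepage row.
+    body = ""
+    for filters in row_filters:  # e.g. [[{"term": {"item_format": "audiobook"}}], ...]
+        query = {
+            "_source": ["_id"],
+            "size": 30,
+            "query": {"script_score": {
+                "query": {"bool": {"filter": filters}},
+                "script": {
+                    "source": "double v = dotProduct(params.user_embed, 'item_embed'); return Math.max(0, v + 10000);",
+                    "params": {"user_embed": user_embed}}}}}
+        body += json.dumps({"index": "item_index"}) + "\n" + json.dumps(query) + "\n"
+    resp = requests.post(f"{ES}/_msearch", data=body,
+                         headers={"Content-Type": "application/x-ndjson"}).json()
+
+    # Step 4: one result set per row, ready for post-processing into homepage rows.
+    return [[hit["_id"] for hit in r["hits"]["hits"]] for r in resp["responses"]]
+```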
+
+With this approach, Elasticsearch will serve two purposes: acting as a key-value store, and retrieving recommendations.
+Elasticsearch is, of course, a slower key-value store than traditional databases, but we found the increase in latency
+to be insignificant (~5ms) for our use case. Furthermore, the benefit of this approach is that it only requires
+maintaining data in one system; using multiple systems to store data would create a consistency challenge.
+
+The underlying Personalization model is very large, making retraining it an expensive process. Thus, it needs to be
+retrained often enough to account for factors like user preference drift, but not so often that it wastes
+computational resources. We found that retraining the model weekly worked well for us. Item embeddings, which typically
+update only incrementally, are also recomputed weekly. However, user embeddings are recomputed daily to provide fresh
+recommendations based on changing user interests. These embeddings along with relevant metadata are ingested into the
+Elasticsearch index in a batch process using [Apache Spark](https://spark.apache.org/) and are scheduled through
+[Apache Airflow](https://airflow.apache.org/). We monitor this ingest process along with real-time serving metrics
+through Datadog.
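+
+As a hedged sketch of what one of those batch ingest jobs could look like (assuming the `elasticsearch-hadoop` Spark
+connector; the paths, index name, column names, and connection settings here are illustrative rather than our actual
+production configuration):
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.appName("user-embedding-ingest").getOrCreate()
+
+# Embeddings computed upstream, e.g. a Parquet/Delta table of (user_id, user_embed).
+users = spark.read.parquet("s3://bucket/path/user_embeddings/")  # hypothetical path
+
+(users.write
+    .format("org.elasticsearch.spark.sql")     # elasticsearch-hadoop connector
+    .option("es.nodes", "elasticsearch-host")  # cluster endpoint
+    .option("es.port", "9200")
+    .option("es.mapping.id", "user_id")        # use user_id as the document _id
+    .mode("append")
+    .save("user_index_20210401"))              # fresh dated index, re-aliased afterwards
+
+# A job like this would then be scheduled as a recurring task in Airflow.
+```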
+
+#### Load Testing
+Our primary goal during load testing was to ensure that our system was able to reliably respond to a “reasonably large”
+number of requests per second and deliver a sufficient number of relevant recommendations, even under the confines of
+multiple facets within each query. We also took this opportunity to experiment with various aspects of our system to
+understand their impact on performance. These include:
+- Shard and replica configuration: We found that increasing the number of shards increased performance, but only to a
+point; if a cluster is over-sharded, the overhead of each shard outweighs the marginal performance gain of the
+additional partition
+- Dataset size: We artificially increased the size of our corpus several times to ensure the system’s performance would remain
+sufficient even as our catalog continues to grow
+- Filter and mapping configurations: Some filters (like `range` inequalities) are more expensive than traditional
+categorical filters. Additionally, increasing the number of fields in each document also has a negative impact on latency.
+Our use case calls for several filters across hundreds of document fields, so we experimented with several document and
+query configurations to find the one that performed best for our system
+
+Our system is currently deployed to production and serves ~50rps with a p95 latency <60ms.
+
+### Results
+Using Scribd’s internal A/B testing platform, we conducted an experiment comparing the existing recommendations service
+with the new personalization model and its embedding-based retrieval architecture across the home and discover page surfaces.
+The test ran for approximately a month with >1M Scribd users (trialers or subscribers) assigned as participants. After
+careful analysis of results, we saw the following statistically significant (p<0.01) improvements in the personalization
+variant compared to the control experience:
+- Increase in the number of users who clicked on a recommended item
+- Increase in the average number of clicks per user
+- Increase in the number of users with a read time of at least 10 minutes (in a three day window)
+
+These increases represent significant business impact on key performance metrics. The personalization model currently
+generates recommendations for every (signed in) Scribd user’s home and discover pages.
+
+### Next Steps
+Now that the infrastructure and model are in place, we are looking to add a slew of improvements to the existing system.
+Our immediate efforts will focus on expanding the scope of this system to include more surfaces and row modules within
+the Scribd experience. Additional long term projects include the addition of an online contextual reranker to increase
+the relevance and freshness of recommendations and potentially integrating our system with an infrastructure as code
+tool to more easily manage and scale compute resources.
+
+Thank you for reading! We hope you found this post useful and informative.
+
+### Thank You 🌮
+Thank you to
+[Snehal Mistry](https://www.linkedin.com/in/snehal-mistry-b986b53/),
+[Jeffrey Nguyen](https://www.linkedin.com/in/jnguyenfc/),
+[Natalie Medvedchuk](https://www.linkedin.com/in/natalie-medvedchuk/),
+[Dimitri Theoharatos](https://www.linkedin.com/in/dimitri-theoharatos/),
+[Adrian Lienhard](https://www.linkedin.com/in/adrianlienhard/),
+and countless others, all of whom provided invaluable guidance and assistance throughout this project.
+
+(giving tacos 🌮 is how we show appreciation here at Scribd)
diff --git a/_posts/2021-04-26-integrating-airflow-and-okta.md b/_posts/2021-04-26-integrating-airflow-and-okta.md
new file mode 100644
index 0000000..b9d19cb
--- /dev/null
+++ b/_posts/2021-04-26-integrating-airflow-and-okta.md
@@ -0,0 +1,122 @@
+---
+layout: post
+title: "Integrating Airflow with Okta"
+author: kuntalb
+tags:
+- okta
+- airflow
+- featured
+team: Core Platform
+---
+
+
+At Scribd we use Airflow as the scheduler for most of our batch workloads.
+This post is not about Airflow itself, so we are not getting into why Airflow.
+It is about one of the biggest challenges we faced while using Airflow and
+finally conquered: authentication and authorisation.
+Airflow does support LDAP, and at Scribd we started out using LDAP with Airflow,
+but as the organisation grew and more and more users started using Airflow,
+it became imperative that we integrate Airflow with our SSO provider, Okta.
+
+Sadly, there is a lack of resources on how to integrate Airflow with Okta specifically.
+This write-up describes our journey of migrating Airflow from the earlier LDAP setup to Okta.
+
+
+## Prerequisite
+This section describes the minimum setup required to enable this integration.
+1. Okta with [API Access Management](https://developer.okta.com/docs/concepts/api-access-management/) enabled.
+Without this feature enabled in Okta, we will not be able to integrate Airflow with Okta.
+
+We are going to use Flask-AppBuilder along with some additional packages to integrate Airflow with Okta.
+At Scribd we use a custom-built Docker image for Airflow; we install the following libraries in that image to make the Okta integration work:
+1. [Flask-AppBuilder
+ 3.2.2](https://github.com/dpgaspar/Flask-AppBuilder/tree/v3.2.2). Official
+ Airflow repo has a
+ [constraint](https://github.com/apache/airflow/blob/master/setup.cfg#L97) on
+ `flask-appbuilder~=3.1,>=3.1.1`, so adding this additionally to the docker image helps us bypass that constraint
+1. `sqlalchemy>=1.3.18, <1.4.0` --> required by a Python dependency of Flask-AppBuilder
+1. `authlib==0.15.3` --> `authlib` needs to be installed alongside Airflow to enable Flask-AppBuilder's OIDC integration with Okta
+
+## Okta Setup
+
+
+
+*Sample Okta Setup*
+
+1. Create an OIDC Web application. Give it a name and leave the values under the “Configure OpenID Connect” section empty.
+1. Make note of the Client ID and the Client Secret, as you will need them when configuring the Airflow webserver.
+1. In the “Allowed Grant Types” section, make sure you check all of the boxes.
+1. For the Login redirect URIs field, you will enter: `https://your-airflow-url-goes-here.com/oauth-authorized/okta`
+1. For the Initiate login URI field, you will enter: `https://your-airflow-url-goes-here.com/login`
+
+## Airflow Configuration
+
+Add the following to `conf/webserver_config.py`. Note that `AUTH_OAUTH` is imported from
+`flask_appbuilder.security.manager`, the `<<>>` placeholders take the Client ID and Client Secret noted during the
+Okta app setup, and the `<>` placeholders take your Okta domain:
+
+ AUTH_TYPE = AUTH_OAUTH
+ OAUTH_PROVIDERS = [
+ {'name': 'okta', 'icon': 'fa-circle-o',
+ 'token_key': 'access_token',
+ 'remote_app': {
+ 'client_id': <<>>,
+ 'client_secret': <<>>,
+ 'api_base_url': 'https://<>/oauth2/v1/',
+ 'client_kwargs': {
+ 'scope': 'openid profile email groups'
+ },
+ 'access_token_url': 'https://<>/oauth2/v1/token',
+ 'authorize_url': 'https://<>/oauth2/v1/authorize',
+ }
+ }
+ ]
+
+A special thanks to Greg Reznik for handling everything related to the Okta-side configuration.
+
+### Special Steps
+
+1. We started with Flask-AppBuilder 3.2.1; however, it had a bug that needed to
+   be fixed, so we raised a [PR for Flask-AppBuilder](https://github.com/dpgaspar/Flask-AppBuilder/pull/1589) to resolve the issue. That PR got merged, and now we can use the new release, Flask-AppBuilder 3.2.2
+
+2. As we were migrating from LDAP, user info was already populated in Airflow's
+   database. Okta, however, generates a new user ID (something like
+   `okta_00u1046sqzJprt1hZ4x6`), and because the email address corresponding to
+   that user was already present we got the error below. To prevent this, we
+   logged into the underlying database for Airflow, cleaned up the `ab_user` and
+   `ab_user_role` tables, and let the Okta integration recreate the users during
+   their first sign-in.
+
+ ```
+ [2021-03-19 16:32:28,559] {manager.py:215} ERROR - Error adding new user to database. (sqlite3.IntegrityError) UNIQUE constraint failed: ab_user.email
+ [SQL: INSERT INTO ab_user (first_name, last_name, username, password, active, email, last_login, login_count, fail_login_count, created_on, changed_on, created_by_fk, changed_by_fk) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)]
+ [2021-03-19 16:32:28,560] {manager.py:1321} ERROR - Error creating a new OAuth user okta_00u1046sqzJprt1hZ4x6
+ ```
+3. Because we had deleted all of the existing users and roles, we ran the
+   following from the Airflow CLI once the first admin user had signed in again.
+   This assigns the Admin role to that user; after that, other users and roles
+   can be managed from the Airflow web console using this admin account.
+ ```
+ airflow users add-role -r Admin -u okta_00u1046sqzJprt1hZ4x6
+ ```
+
+## Known Issue
+
+1. Currently, any action triggered in Airflow is recorded in the audit log with the Okta user ID. Airflow needs to be patched to write audit log entries with human-readable user identifiers instead.
+
+## Final Stage
+
+Once the setup is complete, you will find tiles similar to these on your Okta dashboard:
+
+
+
+*Sample Okta Tiles*
+
+Once you select the tile, it should redirect you to the page below:
+
+
+
+*Okta Login Page*
+
+We hope this post helps you integrate Okta with Airflow. The journey was a bit tricky for us, but we finally made it happen, and we hope this write-up helps other folks do the same successfully.
+
+---
+
+Within Scribd's Platform Engineering group we have a *lot* more services than
+people, so we're always trying to find new ways to automate our infrastructure.
+If you're interested in helping to build out scalable data platform to help
+change the world reads, [come join us!](/careers/#open-positions)
diff --git a/_posts/2021-05-04-backing-up-data-warehouse.md b/_posts/2021-05-04-backing-up-data-warehouse.md
new file mode 100644
index 0000000..a4af54b
--- /dev/null
+++ b/_posts/2021-05-04-backing-up-data-warehouse.md
@@ -0,0 +1,146 @@
+---
+layout: post
+title: "Backing up Delta Lake"
+author: kuntalb
+tags:
+- deltalake
+- s3
+- data-warehouse
+- backup
+- featured
+team: Core Platform
+---
+
+
+Transitioning from a more traditional database operations background (read: ACID, RDBMS and so on) to a newer data platform is always interesting, as it constantly challenges your years-old wisdom and forces you to adapt to newer ways of getting things done.
+
+At [Scribd](https://tech.scribd.com/) we have made
+[Delta Lake](https://delta.io/) a cornerstone of our data platform. All data in
+Delta Lake is stored in [Apache Parquet](https://parquet.apache.org/) format enabling Delta Lake to leverage
+the efficient compression and encoding schemes that are native to Parquet. The
+Delta Lake transaction log (also known as the `DeltaLog`) is an ordered record of
+every transaction that has ever been performed on a Delta Lake table since its
+inception. So for a particular dataset to work properly, it needs to have both the
+Parquet files and the corresponding `DeltaLog`.
+
+When the task of having a workable backup of all those Delta Lake files fell
+into my lap, I decided to look at some of the age-old concepts of backup from a
+new perspective. The concerns I considered were:
+
+ 1. What am I protecting against? How much do I need to protect?
+ 1. Can I survive losing some data during a restore, and do I have the option of rebuilding it again from that point-in-time recovery?
+ 1. What kind of protection do I want to put in place for the backed-up data?
+
+So we set our objectives as:
+
+ 1. We are mainly protecting against human error, where a table can be purged by mistake ([VACUUM](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-vacuum.html)), which would severely hamper our ability to time travel if required.
+ 1. In most cases, if we have a reasonable backup ready, we should be able to rebuild the Delta table data that was lost between the time the backup was taken and the time the table was dropped.
+
+
+## The Devil is in the Details
+
+After deliberating a lot, we decided to do this whole backup operation
+independent of [Delta Lake](https://delta.io/) and go to the lowest layer
+possible, which in our case was S3. I never thought I would say this in my life
+(being an RDBMS DBA), but the moment we got down to the S3 layer, the whole
+thing became a challenge of copying a few S3 buckets (read: millions of files)
+rather than taking a database backup.
+So we started looking for an efficient S3 copy operation and found [AWS S3
+batch
+operations](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html)
+and their support for copying objects across AWS accounts. This was a match
+made in heaven for us.
+You can use [AWS S3 batch operations](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html) to perform large-scale batch operations on Amazon S3 objects. S3 Batch Operations can perform a single operation on a list of Amazon S3 objects that you specify; a single job can perform the specified operation (in our case, copy) on billions of objects containing large amounts of data. The batch operation has the following features:
+
+ 1. Automatically tracks progress.
+ 1. Stores a detailed completion report of all or selected actions in a user defined bucket.
+ 1. Provides a fully managed, auditable, and serverless experience.
+
+Once we decided to use [AWS S3 batch operation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/batch-ops-examples-xcopy.html), the next biggest challenge was how to generate the inventory list that will feed the AWS S3 batch operation. We decided to use [AWS S3 inventory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/storage-inventory.html) to generate the inventory list. There are some challenges associated with that as well.
+
+**Pros**:
+
+* Simple setup; we can terraform it easily
+* Much more efficient compared to generating our own list, as the list-objects API only returns 1,000 keys per call, which means we would have to keep iterating until we had the full list.
+
+**Cons**:
+
+* We do not control when it runs; it generates a report on a daily basis, but the timing is not in our hands.
+* It follows an eventually consistent model, i.e. all of your objects might not appear in each inventory list. The inventory list provides eventual consistency for PUTs (both new objects and overwrites) and DELETEs; inventory lists are a rolling snapshot of bucket items, which are eventually consistent (that is, the list might not include recently added or deleted objects).
+
+To overcome the downsides, we decided to run the backup at a later date, e.g. for a backup of March 31st we based that off a manifest generated on April 2nd. This manifest would certainly have all data up until March 31st and some of April 1st's files as well.
+
+Once we had settled on this model, the rest of the work was similar to any
+other backup process. We also set up the source and the destination to have
+protective boundaries so that we don't accidentally propagate any deletes to
+the backups.
+
+### New Account New Beginning
+
+To prevent accidental deletion of the backed-up data, we decided to put it in a
+completely separate bucket in a different AWS account with stringent access
+controls in place. With a new account it was much easier to control access
+levels from the beginning, rather than restricting access in an already
+existing account where people already have a certain degree of access that is
+hard to modify. In the new account we ensured that only a handful of people
+actually have access to the backed-up data, further reducing the chances of any manual error.
+
+### Backup Process
+
+#### Destination Side
+
+ 1. The backup will be taken in a completely separate AWS account from the source
+    account. Only a few admins will have access to this account, to reduce the
+    chance of manual mistakes.
+ 1. The whole backup process will be automated, with minimal human intervention, to reduce the scope for manual error.
+ 1. On the destination side, we have to create buckets to store the inventory reports based on which the batch job will be run.
+ 1. On the destination side, we also have to create buckets where the batch job
+    will store the actual backup objects. While terraforming this, we have the
+    bucket name dynamically created with the date appended at the end of the
+    bucket name, e.g. `-`, so that we can create a fresh bucket
+    before each full snapshot. Otherwise there is a risk of earlier full
+    snapshots getting overwritten.
+ 1. Create an IAM role for the batch operation; the source account will grant the copy-object permission to this role.
+ 1. We created a Lambda on the destination side to scan through all the `manifest.json` files, create the actual batch operation job, and run it automatically (a minimal sketch of that call follows below).
+
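+A heavily simplified sketch of that Lambda's core call, using the S3 Control API via `boto3` (all ARNs, account IDs,
+and report settings below are placeholders):
+
+```python
+import boto3
+
+s3control = boto3.client("s3control")
+
+def create_copy_job(account_id, manifest_arn, manifest_etag, dest_bucket_arn, role_arn):
+    """Create and auto-run an S3 Batch Operations copy job from an inventory manifest."""
+    return s3control.create_job(
+        AccountId=account_id,
+        ConfirmationRequired=False,      # run without manual confirmation
+        RoleArn=role_arn,                # IAM role the source buckets trust for the copy
+        Priority=10,
+        Operation={"S3PutObjectCopy": {"TargetResource": dest_bucket_arn}},
+        Manifest={
+            "Spec": {"Format": "S3InventoryReport_CSV_20161130"},
+            "Location": {"ObjectArn": manifest_arn, "ETag": manifest_etag},
+        },
+        Report={
+            "Bucket": dest_bucket_arn,   # completion report location (placeholder)
+            "Format": "Report_CSV_20180820",
+            "Enabled": True,
+            "Prefix": "batch-reports",
+            "ReportScope": "AllTasks",
+        },
+    )
+```
+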
+#### Source Side
+
+ 1. We terraformed an inventory management config for all the buckets listed above on the source side.
+ 1. This inventory config will create the inventory in the designated manifest bucket in the destination account.
+ 1. For all the buckets on the source side, we have to add a bucket-level policy that allows the S3 batch operation role created on the destination side to perform the copy operation.
+
+
+### Limitations
+
+These are mainly limitations of AWS S3 batch operations:
+ 1. All source objects must be in one bucket.
+ - This is not a challenge for us, as we invoke a bucket-level copy and create a manifest at the bucket level, which meets this requirement.
+ 1. All destination objects must be in one bucket.
+ - Again, not a challenge for us, for the same reason as above.
+ 1. You must have read permissions for the source bucket and write permissions for the destination bucket.
+ - Proper IAM roles for the S3 batch copy operation take care of this.
+ 1. Objects to be copied can be up to 5 GB in size.
+ - S3 batch copy uses the PUT method, so it is limited to 5 GB per object. If any file larger than 5 GB was uploaded manually, we will skip it. We tested this behaviour and found that the batch operation throws the following error and continues with the rest of the operation:
+ ```Some-file-name,,failed,400,InvalidRequest,The specified copy source is larger than the maximum allowable size for a copy source: 5368709120 (Service: Amazon S3; Status Code: 400; Error Code: InvalidRequest; Request ID: FHNW4MF5ZMKBPDQY; S3 Extended Request ID: /uopiITqnCRtR1/W3K6DpeWTiJM36T/14azeNw4q2gBM0yj+r0GwzhmmHAsEMkhNq9v8NK4rcT8=; Proxy: null)```
+
+ 1. Copy jobs must be created in the destination region, which is the region you intend to copy the objects to.
+ - For our purposes this is what we intended to do anyway.
+ 1. If the buckets are un-versioned, you will overwrite objects with the same key names.
+ - We create a new bucket for each full snapshot to mitigate this.
+
+## Conclusion
+
+The above approach worked well for our purposes, and if we follow the process
+properly it should suffice for many of our use cases. This approach can work
+quite well if, like us, you do not have the luxury of doing a "stop the world"
+on your data warehouse writes and still need a backup with a certain degree of
+confidence. This method does not provide an exact point-in-time snapshot due to
+the “eventually consistent” model of manifest generation, but I believe it
+covers most of the use cases for any Delta Lake backup.
+
+---
+
+Within Scribd's Platform Engineering group we have a *lot* more services than
+people, so we're always trying to find new ways to automate our infrastructure.
+If you're interested in helping to build out scalable data platform to help
+change the world reads, [come join us!](/careers/#open-positions)
diff --git a/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md b/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md
new file mode 100644
index 0000000..4479cbc
--- /dev/null
+++ b/_posts/2021-05-18-growing-the-delta-lake-ecosystem.md
@@ -0,0 +1,99 @@
+---
+layout: post
+title: "Growing the Delta Lake ecosystem with Rust and Python"
+tags:
+- featured
+- rust
+- deltalake
+- python
+author: rtyler
+team: Core Platform
+---
+
+
+Scribd stores billions of records in [Delta Lake](https://delta.io) but writing
+or reading that data had been constrained to a single tech stack, all of that
+changed with the creation of [delta-rs](https://github.com/delta-io/delta-rs).
+Historically using Delta Lake required applications to be implemented with or
+accompanied by [Apache Spark](https://spark.apache.org). Many of our batch
+and streaming data processing applications are all Spark-based, but that's not
+everything that exists! In mid-2020 it became clear that Delta Lake would be a
+powerful tool in areas adjacent to the domain that Spark occupies. From my
+perspective, I figured that we would soon need to bring data into and out of Delta
+Lake in dozens of different ways. Some discussions and prototyping led to the
+creation of "delta-rs", a Delta Lake client written in Rust that can be easily
+embedded in other languages such as
+[Python](https://delta-io.github.io/delta-rs/python), Ruby, NodeJS, and more.
+
+
+The [Delta Lake
+protocol](https://github.com/delta-io/delta/blob/master/PROTOCOL.md) is not
+_that_ complicated as it turns out. At an extremely high level, Delta Lake is a
+JSON-based transaction log coupled with [Apache
+Parquet](https://parquet.apache.org) files stored on disk/object storage. This means the core implementation of Delta in [Rust](https://rust-lang.org) is similarly quite simple. Take the following example from our integration tests which "opens" a table, reads its transaction log and provides a list of Parquet files contained within:
+
+
+```rust
+let table = deltalake::open_table("./tests/data/delta-0.2.0")
+ .await
+ .unwrap();
+assert_eq!(
+ table.get_files(),
+ vec![
+ "part-00000-cb6b150b-30b8-4662-ad28-ff32ddab96d2-c000.snappy.parquet",
+ "part-00000-7c2deba3-1994-4fb8-bc07-d46c948aa415-c000.snappy.parquet",
+ "part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet",
+ ]
+);
+```
+
+Our primary motivation for delta-rs was to create something which would
+accommodate high-throughput writes to Delta Lake and allow embedding for
+languages like Python and Ruby such that users of those platforms could perform
+light queries and read operations.
+
+The first notable writer-based application being co-developed with delta-rs is
+[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest). The
+project aims to provide a highly efficient daemon for ingesting
+Kafka-originating data into Delta tables. In Scribd's stack, it will
+effectively bridge JSON flowing into [Apache Kafka](https://kafka.apache.org)
+topics into pre-defined Delta tables, translating a single JSON message into a
+single row in the table.
+
+From the reader standpoint, the Python interface built on top of delta-rs,
+contributed largely by [Florian Valeye](https://github.com/fvaleye), makes
+working with Delta Lake even simpler; for most architectures you only need
+to run `pip install deltalake`:
+
+```python
+from deltalake import DeltaTable
+from pprint import pprint
+
+if __name__ == '__main__':
+ # Load the Delta Table
+ dt = DeltaTable('s3://delta/golden/data-reader-primitives')
+
+ print(f'Table version: {dt.version()}')
+
+ # List out all the files contained in the table
+ for f in dt.files():
+ print(f' - {f}')
+
+ # Create a Pandas dataframe to execute queries against the table
+ df = dt.to_pyarrow_table().to_pandas()
+ pprint(df.query('as_int % 2 == 1'))
+```
+
+I cannot stress enough how much potential the above Python snippet has for
+machine learning and other Python-based applications at Scribd. For a number
+of internal applications developers have been launching Spark clusters for the
+sole purpose of reading some data from Delta Lake in order to start their model
+training workloads in Python. With the maturation of the Python `deltalake`
+package, now there is a fast and easy way to load Delta Lake into basic Python
+applications.
+
+
+
+From my perspective, it's only the beginning with [delta-rs](https://github.com/delta-io/delta-rs). Delta Lake is a deceptively simple technology with tremendous potential across the data platform. I will be sharing more about delta-rs at [Data and AI Summit](https://databricks.com/dataaisummit/north-america-2021) on May 27th at 12:10 PDT. I hope you'll join [my session](https://databricks.com/speaker/r-tyler-croy) with your questions about delta-rs and where we're taking it!
+
+
diff --git a/_posts/2021-05-19-kafka-delta-ingest.md b/_posts/2021-05-19-kafka-delta-ingest.md
new file mode 100644
index 0000000..03bad83
--- /dev/null
+++ b/_posts/2021-05-19-kafka-delta-ingest.md
@@ -0,0 +1,195 @@
+---
+layout: post
+title: "Kafka to Delta Lake, as fast as possible"
+tags:
+- featured
+- rust
+- deltalake
+- kafka
+author: christianw
+team: Core Platform
+---
+
+Streaming data from Apache Kafka into Delta Lake is an integral part of
+Scribd's data platform, but has been challenging to manage and
+scale. We use Spark Structured Streaming jobs to read data from
+Kafka topics and write that data into [Delta Lake](https://delta.io) tables. This approach gets the job
+done but in production our experience has convinced us that a different
+approach is necessary to efficiently bring data from Kafka to Delta Lake. To
+serve this need, we created
+[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest).
+
+The user requirements are likely relatable to a lot of folks:
+
+* _My application emits data into Kafka that I want to analyze later._
+* _I want my Kafka data to land in the data warehouse and be queryable pretty soon after ingestion._
+
+Looking around the internet, there are a few approaches people blog about,
+but most would either cost too much, be really complicated to set up and maintain,
+or both. Our first Spark-based attempt at solving this problem falls under
+"both."
+
+Spark Structured Streaming is a powerful streaming framework that can easily
+satisfy the requirements described above with a few lines of code (about 70 in
+our case) but the cost profile is pretty high. Despite the relative simplicity
+of the code, the cluster resources necessary are significant. Many of our
+variable throughput Kafka topics leave us wishing for auto-scaling too.
+
+[Kafka Delta Ingest](https://github.com/delta-io/kafka-delta-ingest) is an open
+source daemon created by Scribd in the [Delta Lake project](https://delta.io)
+with the very specific goal of optimizing the path from Kafka to Delta Lake. By
+focusing on this very specific use-case, we can remove many of the pain points
+we currently experience with our Spark streaming jobs. The daemon is written in
+[Rust](https://rust-lang.org) which has helped us keep the runtime super
+efficient. It is also fully distributed with no coordination between workers,
+meaning no driver node hanging out and a smaller overall infrastructure
+footprint.
+
+## In depth
+
+There is a bit of an impedance mismatch between Kafka streams and data warehouse
+file structure. [Parquet is a columnar
+format](https://parquet.apache.org/documentation/latest/), and each Parquet
+file (in fact each row group within a file) in a Delta Lake table should
+include a lot of rows to enable queries to leverage all the neat optimization
+features of parquet and run as fast as possible. Messages consumed from a Kafka
+topic come in one at a time though. To bridge this mismatch, Kafka Delta Ingest
+spends most of its time buffering messages in memory. It uses a few process
+arguments to decide when to flush, so that it writes the largest possible Parquet files. Those arguments are:
+
+* `allowed_latency` - the latency allowed between each Delta write
+* `max_messages_per_batch` - the maximum number of messages/records to include in each Parquet row group within a file
+* `min_bytes_per_file` - the minimum bytes per Parquet file written out by Kafka Delta Ingest
+
+Our internal Kafka usage guidelines include these constraints:
+
+* Messages written to Kafka
+ * Must be JSON
+ * Must include an ISO 8601 timestamp representing when the message was ingested/created (field name is flexible, but this timestamp must be included somewhere in the message)
+
+* Records written to Delta Lake
+ * Must include Kafka metadata
+ * We preserve the metadata fields below under a struct field called `meta.kafka`
+ * topic
+ * partition
+ * offset
+ * timestamp
+ * Must include a date-based partition (e.g. yyyy-MM-dd) derived from the ISO 8601 ingestion timestamp of the message
+
+Other potential users of Kafka Delta Ingest may have different guidelines on how they use Kafka. Because of how we use Kafka internally, the first iteration of Kafka Delta Ingest is very focused on:
+
+* JSON formatted messages
+* Buffer flush triggers that thread the needle between query performance and persistence latency
+* Very basic message transformations to limit the message schema constraints we push up to our producer applications
+
+### Example
+
+Let's say we have an application that writes messages onto a Kafka topic called
+`web_requests` every time it handles an HTTP request. The message schema
+written by the producer application includes fields such as:
+
+* `status`: 200, 404, 500, 302, etc.
+* `method`: `GET`, `POST`, etc.
+* `url`: Requested URL, e.g. `/documents/42`, etc.
+* `meta.producer.timestamp`: an ISO-8601 timestamp representing the time the producer wrote the message.
+
+Many of our tables are partitioned by a field called `date` which
+has a `yyyy-MM-dd` format. We choose not to force our producer application to
+provide this field explicitly. Instead, we will configure our Kafka Delta
+Ingest stream to perform a transformation of the `meta.producer.timestamp`
+field that the producer already intends to send.
+
+To accomplish this with Kafka Delta Ingest, using the "web_requests" stream as an example, we would:
+
+1. Create the "web_requests" topic
+1. Create the schema for our Delta Lake table:
+ ```
+CREATE TABLE `kafka_delta_ingest`.`web_requests` (
+ `meta` STRUCT<
+ `kafka`: STRUCT<`offset`: BIGINT, `topic`: STRING, `partition`: INT>,
+    `producer`: STRUCT<`timestamp`: STRING>
+ >,
+ `method` STRING,
+ `status` INT,
+ `url` STRING,
+ `date` STRING
+ )
+USING delta
+PARTITIONED BY (`date`)
+LOCATION 's3://path_to_web_requests_delta_table'
+```
+
+ The Delta Lake schema we create includes more fields than the producer
+ actually sends. Fields not written by the producer include the `meta.kafka`
+ struct and the `date` field.
+
+3. Launch one or more kafka-delta-ingest workers to handle the topic-to-table pipeline:
+ ```
+kafka-delta-ingest ingest web_requests s3://path_to_web_requests_delta_table \
+ -l 60 \
+ -K "auto.offset.reset=earliest" \
+ -t 'date: substr(meta.producer.timestamp, `0`, `10`)' \
+ 'meta.kafka.offset: kafka.offset' \
+ 'meta.kafka.partition: kafka.partition' \
+ 'meta.kafka.topic: kafka.topic'
+```
+
+The parameters passed to the daemon configure the allowed latency, some primitive data augmentation, the source topic, and the destination Delta table. For more detailed documentation, consult the [readme](https://github.com/delta-io/kafka-delta-ingest#readme).
+
+Internally, Kafka Delta Ingest relies on Kafka consumer groups to coordinate
+partition assignment across many workers handling the same topic. If we
+want to scale out the number of workers handling "web_requests" we can just
+launch more ECS tasks with the same configuration and respond to Kafka's
+rebalance events.
+
+The deployment ends up looking like:
+
+
+
+We have one Kafka Delta Ingest ECS service per topic-to-table ETL workload. Each service runs 24x7. We expect high volume topics to require more worker nodes and scale out and in occasionally, and low volume topics to require a single worker (more on that later).
+
+
+### 💙
+
+My favorite thing about Kafka Delta Ingest is its very narrow scope to optimize
+and replace a _very_ common use case that you _could_ support with Spark
+Structured Streaming, but much less efficiently. Basically, I love that we are
+creating a very specific tool for a very common need.
+
+Compare/contrast of Kafka Delta Ingest vs Spark Structured Streaming:
+
+* Kafka Delta Ingest *ONLY* supports Kafka as a source, whereas Spark Structured Streaming supports generic sources.
+* Kafka Delta Ingest *ONLY* supports Delta Lake as a sink, whereas Spark Structured Streaming supports generic sinks.
+* Kafka Delta Ingest *ONLY* supports JSON messages (so far), whereas Spark Structured Streaming supports a variety of formats.
+* Unlike Spark Structured Streaming, Kafka Delta Ingest *DOES NOT* provide any facility for joining streams or computing aggregates.
+* Kafka Delta Ingest is an application that makes strong assumptions about the source and sink and is only configurable via command line arguments, whereas Spark Structured Streaming is a library that you must write and compile code against to yield a jar that can then be hosted as a job.
+* Kafka Delta Ingest is fully distributed and master-less - there is no "driver" node. Nodes can be spun up on a platform like ECS with little thought to coordination or special platform dependencies. A Spark Structured Streaming job must be launched on a platform like Databricks or EMR capable of running a Spark cluster.
+
+## Get Involved!
+
+Contributions to Kafka Delta Ingest are very welcome and encouraged. Our core team has been focused on supporting our internal use case so far, but we would love to see Kafka Delta Ingest grow into a more well rounded solution. We have not been using the [GitHub issue list](https://github.com/delta-io/kafka-delta-ingest/issues) for managing work just yet since we are mostly managing work internally until we have our primary workloads fully covered, but we will be paying much more attention to this channel in the very near future.
+
+One especially interesting area for contribution is related to data format. A
+lot of folks are using Avro and Protobuf instead of JSON these days. We use
+JSON on all of our ingestion streams at the moment, but I'd love to see Avro
+and Protobuf support in Kafka Delta Ingest.
+
+Another big contribution would be support for running periodically
+rather than continuously (24x7). I suspect a lot of folks have situations
+where Kafka is used as a buffer between data warehouse writes that
+occur periodically throughout the day. We have several low-volume topics that
+are not a good fit for 24x7 streaming because they only produce one or two
+messages per second. Having a 24x7 process buffer these topics in memory would
+be very awkward. It would make a lot more sense to let these buffer in Kafka
+and launch a periodic cron-style job to do the ETL a few times a day. This is
+similar to the "Trigger Once" capability in [Spark Structured
+Streaming](https://databricks.com/blog/2017/05/22/running-streaming-jobs-day-10x-cost-savings.html).
+
+Another vector for contribution is
+[delta-rs](https://github.com/delta-io/delta-rs). Delta-rs is another Scribd
+sponsored open source project and is a key dependency of kafka-delta-ingest.
+Any write-oriented improvement accepted in delta-rs is likely to benefit
+kafka-delta-ingest.
+
+
+Kafka Delta Ingest has a bright future ahead and I hope you'll join us!
diff --git a/_posts/2021-07-08-automate-databricks-with-terraform.md b/_posts/2021-07-08-automate-databricks-with-terraform.md
new file mode 100644
index 0000000..f995b0c
--- /dev/null
+++ b/_posts/2021-07-08-automate-databricks-with-terraform.md
@@ -0,0 +1,51 @@
+---
+layout: post
+title: "Automating Databricks with Terraform"
+team: Core Platform
+author: rtyler
+tags:
+- databricks
+- terraform
+- featured
+---
+
+The long term success of our data platform relies on putting tools into the
+hands of developers and data scientists to “choose their own adventure”. A big
+part of that story has been [Databricks](https://databricks.com) which we
+recently integrated with [Terraform](https://terraform.io) to make it easy to
+scale a top-notch developer experience. At the 2021 Data and AI Summit, Core
+Platform infrastructure engineer [Hamilton
+Hord](https://github.com/HamiltonHord) and Databricks engineer [Serge
+Smertin](https://github.com/nfx) presented on the Databricks terraform provider
+and how it's been used by Scribd.
+
+In the session embedded below, they share the details on the [Databricks (Labs)
+Terraform
+integration](https://github.com/databrickslabs/terraform-provider-databricks)
+and how it can automate literally every aspect required for a production-grade
+platform: data security, permissions, continuous deployment and so on. They
+also discuss the ways in which our Core Platform team enables internal
+customers without acting as gatekeepers for data platform changes. Just about
+anything they might need in Databricks is a pull request away!
+
+
+
+
+
+
+In hindsight, it's mind-boggling how much manual configuration we had to
+previously maintain. With the Terraform provider for Databricks we can very
+easily test, reproduce, and audit hundreds of different business critical
+Databricks resources. Coupling Terraform with the recent "multi-workspace"
+support that Databricks unveiled in 2020 means we can also now provision an
+entirely new environment in a few hours!
+
+Investing in data platform tools and automation is a key part of the vision for
+Platform Engineering which encompasses Data Engineering, Data Operations, and
+Core Platform. We have a [number of open positions](/careers/#open-positions)
+at the moment, but I wanted to call special attention to the [Data Engineering
+Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156)
+role for which we're currently hiring. The leader of the Data Engineering team
+will help deliver data tools and solutions for internal customers building on
+top of Delta Lake, Databricks, Airflow, and Kafka. Suffice it to say, there's a
+lot of really interesting work to be done!
diff --git a/_posts/2021-07-12-identifying-document-types.md b/_posts/2021-07-12-identifying-document-types.md
new file mode 100644
index 0000000..b8f8fed
--- /dev/null
+++ b/_posts/2021-07-12-identifying-document-types.md
@@ -0,0 +1,131 @@
+---
+layout: post
+title: "Identifying Document Types at Scribd"
+tags:
+- machinelearning
+- data
+- featured
+- kyc-series
+team: Applied Research
+author: jonathanr
+---
+
+
+User-uploaded documents have been a core component of Scribd’s business from
+the very beginning, and understanding what is _actually_ in the document corpus
+unlocks exciting new opportunities for discovery and recommendation.
+With Scribd anybody can [upload and share
+documents](https://www.scribd.com/docs), analogous to YouTube and videos. Over
+the years, our document corpus has become larger and more diverse which has
+made understanding it an ever-increasing challenge.
+Over the past year one of the missions of the Applied Research team has been to
+extract key document metadata to enrich
+downstream discovery systems. Our approach combines semantic understanding with
+user behaviour in a multi-component machine learning system.
+
+This is part 1 in a series of blog posts explaining the challenges and
+solutions explored while building this system. This post presents the
+limitations, challenges, and solutions encountered when developing a model to
+classify arbitrary user-uploaded documents.
+
+
+## Initial Constraints
+
+The document corpus at Scribd stretches far and wide in terms of content, language and structure. An arbitrary document can be anything from math homework to Philippine law to engineering schematics. In the first stage of the document understanding system, we want to exploit visual cues in the documents. Any model used here must be language-agnostic to apply to arbitrary documents. This is analogous to a “first glance” from humans, where we can quickly distinguish a comic book from a business report without having to read any text. To satisfy these requirements, we use a computer vision model to predict the document type. But what is a “type”?
+
+
+
+## Identifying Document Types
+
+A necessary question to ask, but a difficult one to answer – what kind of documents do we have? As mentioned in the section above, we’re interested in differentiating documents based on visual cues, such as text-heavy versus spreadsheet versus comics. We’re not yet interested in more granular information like fiction vs. non-fiction.
+
+Our approach to this challenge was twofold. First, we talked to subject matter experts at Scribd about the kinds of documents they have seen in the corpus. This was, and continues to be, very informative, as they have domain-specific knowledge that we leverage with machine learning. The second approach was to use a data-driven method to explore documents: we created embeddings for documents based on their usage, then clustered and plotted these embeddings on an interactive map, which allowed us to examine the document structure in different clusters. Combining these two methods drove the definition of document types. Below is an example of one of the maps we used to explore the corpus.
+
+
+
+
+
+
+
+ Figure 1: Map of the document corpus built from user-interaction embeddings. More on this method in a future post.
+
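+As a rough illustration of how a map like this can be built (a generic sketch assuming `umap-learn` and scikit-learn;
+it is not the exact tooling behind Figure 1):
+
+```python
+import matplotlib.pyplot as plt
+import umap
+from sklearn.cluster import KMeans
+
+def plot_document_map(doc_embeddings, n_clusters=20):
+    """doc_embeddings: an (n_docs, dim) array of usage-based document embeddings."""
+    # Project the high-dimensional embeddings down to 2D for plotting.
+    coords = umap.UMAP(n_components=2, metric="cosine").fit_transform(doc_embeddings)
+
+    # Cluster in the original embedding space and use the labels as colours.
+    labels = KMeans(n_clusters=n_clusters).fit_predict(doc_embeddings)
+
+    plt.scatter(coords[:, 0], coords[:, 1], c=labels, s=2, cmap="tab20")
+    plt.title("Document corpus map (usage-based embeddings)")
+    plt.show()
+```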
+
+
+We converged on 6 document types, which included sheet-music, text-heavy, comics and tables. More importantly, these 6 classes don’t account for every single document in our corpus. While there are many different ways of dealing with out-of-distribution examples in the literature, our approach was to explicitly add an “other” class to the model and train on it. We talk more about the intuition behind it, potential solutions to the problem, and the challenges we faced in the coming sections.
+
+
+## Document Classification
+
+As mentioned in the introduction, we need an approach that is language and content agnostic, meaning that the same model will be appropriate for all documents, whether they contain images, text, or a combination of both. To satisfy these constraints we use a computer vision model to classify individual pages. These predictions can then be combined with other meta-data such as page count or word count to form a prediction for the entire document.
+
+
+### Gathering Labelled Pages and Documents
+
+Before the model training started, we faced an interesting data gathering problem. Our goal is to classify documents, so we must gather labelled documents. However, in order to train the page classifier mentioned above, we must also gather labelled pages. Naively, it might seem appropriate to gather labelled documents and use the document label for each of its pages. This isn't appropriate as a single document can contain multiple types of pages. As an example, consider the pages in this document.
+
+
+
+
+ Figure 2: Three different pages from the same document to demonstrate why we can't take the document label and assign it to each page.
+
+
+
+The first and third pages can be considered text-heavy, but definitely not the second. Taking all the pages of this document and labelling them as text-heavy would severely pollute our training and testing data. The same logic applies to each of our 6 classes.
+
+To circumvent this challenge, we took an active learning approach to data gathering. We started with a small set of hand-labelled pages for each class and trained binary classifiers iteratively. The binary classification problem is simpler than the multi-class problem, requiring less hand-labelled data to obtain reliable results. At each iteration, we evaluated the most confident and least confident predictions of the model to get a sense of its inductive biases. Judging from these, we supplemented the training data for the next iteration to tweak the inductive biases and have confidence in the resulting model and labels. The sheet music class is a prime example of tweaking inductive biases. Below is an example of a page that can cause a sheet music misclassification if the model learns that sheet music is any page with horizontal lines. Supplementing the training data at each iteration helps get rid of inductive biases like this.
+
+
+
+
+ Figure 3: Example of possible sheet music misclassification due to wrong inductive biases.
+
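+The inspection step in that loop can be as simple as ranking unlabelled pages by the binary classifier's confidence;
+here is a minimal sketch (the function name and the cut-off `k` are illustrative):
+
+```python
+import numpy as np
+
+def pages_to_inspect(scores, k=50):
+    """Pick the k most and k least confident predictions of a binary page classifier.
+
+    scores: array of predicted probabilities for the positive class (e.g. "sheet music").
+    """
+    confidence = np.abs(scores - 0.5)   # distance from the decision boundary
+    order = np.argsort(confidence)
+    least_confident = order[:k]          # ambiguous, boundary-like pages
+    most_confident = order[-k:]          # check these for confidently wrong biases
+    return most_confident, least_confident
+```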
+
+After creating these binary classifiers for each class, we have a large set of reliable labels and classifiers that can be used to gather more data if necessary.
+
+
+### Building a Page Classifier
+
+The page classification problem is very similar to ImageNet classification, so we can leverage pre-trained ImageNet models. We used transfer learning in [fast.ai](https://www.fast.ai/) and [PyTorch](https://pytorch.org/) to fine-tune pre-trained computer vision architectures for the page-classifier. After initial experiments, it was clear that models with very high ImageNet accuracy, such as EfficientNet, did not perform much better on our dataset. While it’s difficult to pinpoint exactly why this is the case, we believe it is because of the nature of the classification task, the page resolutions and our data.
+
+We found [SqueezeNet](https://arxiv.org/pdf/1602.07360.pdf), a relatively established lightweight architecture, to be the best balance between accuracy and inference time. Because models such as ResNets and DenseNets are so large, they take a lot of time to train and iterate on. However, SqueezeNet is an order of magnitude smaller than these models, which opens up more possibilities in our training scheme. Now we can train the entire model and are not limited to using the pre-trained architecture as a feature-extractor, which is the case for larger models.
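+
+A minimal PyTorch sketch of this kind of fine-tuning (illustrative only: it assumes `torchvision`'s pre-trained
+SqueezeNet, a hypothetical class count, and a generic training loop rather than our exact setup):
+
+```python
+import torch
+import torch.nn as nn
+from torchvision import models
+
+NUM_CLASSES = 7  # e.g. the 6 document types plus the "other" class (hypothetical count)
+
+# Start from ImageNet weights, then replace the classification head.
+model = models.squeezenet1_1(pretrained=True)
+model.classifier[1] = nn.Conv2d(512, NUM_CLASSES, kernel_size=1)
+model.num_classes = NUM_CLASSES
+
+# Because SqueezeNet is small, we can afford to fine-tune every layer
+# rather than freezing the backbone and only training the new head.
+optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+criterion = nn.CrossEntropyLoss()
+
+def train_step(pages, labels):
+    """One optimisation step over a batch of rendered page images."""
+    optimizer.zero_grad()
+    loss = criterion(model(pages), labels)
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+```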
+
+
+
+
+ Figure 4: SqueezeNet architectures taken from the paper. Left: SqueezeNet; Middle: SqueezeNet with simple bypass; Right: SqueezeNet with complex bypass.
+
+
+
+Additionally, for this particular model, low inference time is key in order to run it on hundreds of millions of documents. Inference time is also directly tied to cost, so a slower model would need to deliver significantly better accuracy to justify the higher processing time.
+
+
+### Ensembled Pages for Document Classification
+
+We now have a model to classify document pages, and we need to combine its page-level predictions into a prediction for the whole document, ideally together with additional meta-data such as total page count, page dimensions, etc. However, our experiments here showed that a simple ensemble of the page classifications provided an extremely strong baseline that was difficult to beat by adding meta-data.
+
+To increase efficiency, we sample 4 pages from the document to ensemble. This way we don’t run into processing issues for documents with thousands of pages. This sample size was chosen based on the performance of the classifier and the page distribution in the document corpus, which empirically verified our assumption that it reasonably represents each document.
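+
+A simplified sketch of that ensembling step (the uniform page sampling and plain averaging here are assumptions for
+illustration):
+
+```python
+import numpy as np
+
+def classify_document(page_images, page_classifier, sample_size=4):
+    """Predict a document type by averaging class probabilities over sampled pages."""
+    # Spread the sample across the document instead of only looking at the first pages.
+    idx = np.linspace(0, len(page_images) - 1,
+                      num=min(sample_size, len(page_images)), dtype=int)
+    sampled = [page_images[i] for i in idx]
+
+    # page_classifier returns one probability vector per page (e.g. softmax output).
+    page_probs = np.stack([page_classifier(page) for page in sampled])
+
+    doc_probs = page_probs.mean(axis=0)
+    return doc_probs.argmax(), doc_probs
+```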
+
+
+### Error Analysis and Overconfidence
+
+After error analysis of a large sample of documents from production, we found that some classes were returning overconfident but wrong predictions. This is a very interesting challenge and one that has seen an explosion of academic research recently. To elaborate, we found documents that were predicted wrongly with over 99% confidence scores. A major consequence of this is that it negates the effectiveness of setting a threshold on model output in order to increase precision.
+
+While there are different ways of dealing with this, our approach involved two steps. Firstly, we utilized the “other” class mentioned earlier. By adding many of these adversarial, out-of-distribution examples to the “other” class and re-training the model, we were able to quickly improve metrics without changing model architecture. Secondly, this affected some classes more than others. For these, individual binary classifiers were built to improve precision.
+
+### Where do we go from here?
+
+
+
+ Figure 5: Diagram of the overall document understanding system. The red box is what we talked about in this post
+
+
+
+Now that we have a model to filter documents based on visual cues, we can build dedicated information extraction models for each document type – sheet music, text-heavy, comics, tables. This is exactly how we proceed from here, and we start with extracting information from text-heavy documents.
+
+[Part 2](/blog/2021/information-extraction-at-scribd.html) in this series will dive deeper into the challenges and solutions our
+team encountered while building these models. If you're interested to learn more about the problems Applied Research is solving or the systems which are built around those solutions, check out [our open positions!](/careers/#open-positions)
+
+
+## References
+
+- [SqueezeNet: AlexNet-Level Accuracy with 50X Fewer Parameters and <0.5MB Model Size](https://arxiv.org/pdf/1602.07360.pdf)
diff --git a/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md
new file mode 100644
index 0000000..d476d5e
--- /dev/null
+++ b/_posts/2021-07-20-growing-delta-ecosystem-with-rust.md
@@ -0,0 +1,46 @@
+---
+layout: post
+title: "Presenting Rust and Python Support for Delta Lake"
+tags:
+- deltalake
+- databricks
+- featured
+- rust
+author: rtyler
+team: Core Platform
+---
+
+Delta Lake is integral to our data platform which is why we have invested
+heavily in [delta-rs](https://github.com/delta-io/delta-rs) to support our
+non-JVM Delta Lake needs. This year I had the opportunity to share the progress
+of delta-rs at Data and AI Summit. Delta-rs was originally started by my colleague [QP](https://github.com/houqp) just over a year ago and it has now grown into a multi-company project with numerous contributors, and downstream projects such as [kafka-delta-ingest](/blog/2021/kafka-delta-ingest.html).
+
+
+
+In the session embedded below, I introduce the delta-rs project which is
+helping bring the power of Delta Lake outside of the Spark ecosystem. By
+providing a foundational Delta Lake library in Rust, delta-rs can enable native
+bindings in Python, Ruby, Golang, and more. We will review what functionality
+delta-rs supports in its current Rust and Python APIs and the upcoming roadmap.
+
+I also try to give an overview of one of the first projects to use it in
+production:
+[kafka-delta-ingest](https://github.com/delta-io/kafka-delta-ingest), which
+builds on delta-rs to provide a high throughput service to bring data from
+Kafka into Delta Lake.
+
+
+
+
+
+
+
+Investing in data platform tools and automation is a key part of the vision for
+Platform Engineering which encompasses Data Engineering, Data Operations, and
+Core Platform. We have a [number of open positions](/careers/#open-positions)
+at the moment including a position to work closely with me as [Data Engineering
+Manager](https://jobs.lever.co/scribd/e2187c1c-a1d6-4b77-bde6-acc997f68156).
+The leader of the Data Engineering team will help deliver data tools and
+solutions for internal customers building on top of Delta Lake, Databricks,
+Airflow, and Kafka. Suffice it to say, there's a lot of really interesting work
+to be done!
diff --git a/_posts/2021-07-21-information-extraction-at-scribd.md b/_posts/2021-07-21-information-extraction-at-scribd.md
new file mode 100644
index 0000000..6708e45
--- /dev/null
+++ b/_posts/2021-07-21-information-extraction-at-scribd.md
@@ -0,0 +1,163 @@
+---
+layout: post
+title: "Information Extraction at Scribd"
+tags:
+- machinelearning
+- data
+- featured
+- kyc-series
+team: Applied Research
+authors:
+- antoniam
+- rafaelp
+---
+
+Extracting metadata from our documents is an important part of our discovery
+and recommendation pipeline, but discerning useful and relevant details
+from text-heavy user-uploaded documents can be challenging. This is
+part 2 in a series of blog posts describing a multi-component machine learning
+system the Applied Research team built to extract metadata from our documents in order to enrich downstream discovery models. In this post, we present the challenges and
+limitations the team faced when building information extraction NLP models for Scribd's
+text-heavy documents and how they were solved.
+
+As mentioned in [part 1](/blog/2021/identifying-document-types.html), we now have a way of identifying text-heavy documents. Having done that, we want to build dedicated models to deepen our semantic understanding of them. We do this by extracting keyphrases and entities.
+
+
+
+ Figure 1: Diagram of our multi-component machine learning system.
+
+
+Keyphrases are phrases that represent major themes/topics, whereas entities are proper nouns such as people, places and organizations. For example, when a user uploads a document about the Manhattan project, we will first detect it is text-heavy, then extract keyphrases and entities. Potential keyphrases would be “atomic bomb” and “nuclear weapons” and potential entities would be “Robert Oppenheimer” and “Los Alamos”.
+
+As keyphrase extraction brings out the general topics discussed in a document, it helps put a cap on the amount of information kept per document, resulting in a somewhat uniform representation of documents irrespective of their original size. Entity extraction, on the other hand, identifies elements in a text that aren't necessarily reflected by keyphrases only. We found the combination of keyphrase and entity extraction to provide a rich semantic description of each document.
+
+The rest of this post explains how we approached keyphrase and entity extraction, how we identify whether a subset of these keyphrases and entities is present in a knowledge base (also known as linking), and how we use them to categorize documents.
+
+## Keyphrase Extraction
+
+Typically a keyphrase extraction system operates in two steps as indicated in this survey:
+
+- Using heuristics to extract a list of words/phrases that serve as candidate keyphrases, such as part-of-speech language patterns, stopwords filtering, and n-grams with Wikipedia article titles
+
+- Determining which of these candidate keyphrases are most likely to be true keyphrases, using one of two approaches:
+
+ - Supervised approaches such as binary classification of candidates (useful/not useful), structural features based on positional encoding, etc.
+
+ - Unsupervised approaches such as selecting terms with the highest tf-idf and clustering.
+
+Training a decent supervised model to be able to extract keyphrases across a wide variety of topics would require a large amount of training data, and might generalize very poorly. For this reason, we decided to take the unsupervised approach.
+
+Our implementation of keyphrase extraction is optimized for speed without sacrificing much keyphrase quality. We employ both statistical methods and language-specific rules to identify keyphrases efficiently.
+
+We simply start by filtering out stopwords and extracting the n-grams with a base n (bi-grams in our case, n=2). This step is fast and straightforward and results in an initial set of candidate n-grams.
+
+Limiting the results to a single n-gram class, however, results in split keyphrases, which makes linking them to a knowledge base a challenging task. To address this, we attempt to agglomerate lower-order n-grams into potentially longer keyphrases, as long as the longer phrase occurs at a predetermined minimum frequency relative to the shorter n-gram, based on the following pattern:
+
+`A sequence of nouns (NN) possibly interleaved with either Coordinating Conjunctions (CC) or Prepositions and Subordinating Conjunctions (IN).`
+
+Here are a few examples:
+
+- Assuming the minimum frequency of agglomeration is 0.5, we would replace the bi-gram `world (NN) health (NN)` by `world (NN) health (NN) organization (NN)` only if `world health organization` occurs at least 50% as often as `world health` does.
+
+- Replace `Human (NNP) Development (NNP)` with `Center (NNP) for (IN) Global (NNP) Development (NNP)` only if the latter occurs at least a predetermined percentage of the time compared to the former.
+
+This method results in more coherent and complete keyphrases that could be linked more accurately to a knowledge base entry.
+
+Finally, we use the count of occurrences of each candidate keyphrase as a proxy for its importance. This method is reliable for longer documents, as the repetition of a keyphrase tends to indicate its centrality to the document’s topic.
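+
+Below is a minimal sketch of the candidate extraction and agglomeration steps described above. The stopword list, tokenization, and 0.5 merge threshold are illustrative assumptions, and the part-of-speech pattern (NN/CC/IN) is omitted for brevity; this is not the production implementation.
+
+```python
+from collections import Counter
+
+STOPWORDS = {"the", "a", "an", "of", "and", "in", "to", "is", "was", "for"}
+MIN_MERGE_RATIO = 0.5  # a trigram must occur >= 50% as often as its bigram prefix
+
+def ngrams(tokens, n):
+    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
+
+def extract_keyphrases(text, top_k=10):
+    tokens = [t for t in text.lower().split() if t.isalpha() and t not in STOPWORDS]
+    bigram_counts = Counter(ngrams(tokens, 2))
+    trigram_counts = Counter(ngrams(tokens, 3))
+
+    # Agglomerate: prefer a trigram that extends a bigram whenever the trigram
+    # is frequent enough relative to that bigram.
+    candidates = Counter()
+    for bigram, count in bigram_counts.items():
+        extension = next(
+            (
+                (tri, tri_count)
+                for tri, tri_count in trigram_counts.items()
+                if tri[:2] == bigram and tri_count >= MIN_MERGE_RATIO * count
+            ),
+            None,
+        )
+        phrase, phrase_count = extension if extension else (bigram, count)
+        candidates[" ".join(phrase)] += phrase_count
+
+    # Occurrence counts serve as a proxy for importance.
+    return candidates.most_common(top_k)
+```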
+
+## Named Entities
+
+Keyphrases are only one side of finding what’s important in a document. To further capture what a document is about, we must also consider the named entities that are present.
+
+Named Entity Extraction systems identify instances of named entities in a text, which we can count in order to represent their importance in the document, similar to how we did with keyphrases.
+
+Naively counting named entities through exact string matches surfaces an interesting problem: a single entity may go by many names or aliases, which means string frequency is an unreliable measure of importance. In the example given in Figure 2, we know that “Mill”, “John Stuart Mill” and “Stuart Mill” all refer to the same person. This means that Mill is even more central to the document than the table indicates, since he is referred to a total of 8 times instead of 5.
+
+
+
+
+ Figure 2: Excerpt from John Stuart Mill’s Wikipedia page (left) and Top 5 Named Entity counts of the first few paragraphs (right).
+
+
+To address this counting problem, let's introduce a few abstractions:
+
+- `Named Entity` refers to a unique person, place or organization. Because of their uniqueness, we can represent them with a unique identifier (ID).
+
+- `Named Entity Alias` (or simply Alias), is one of possibly many names associated with a particular entity.
+
+- `Canonical Alias` is the preferred name for an entity.
+
+- `Named Entity Mention` (or simply `Mention`), refers to each occurrence in a text that a Named Entity was referred to, regardless of which Alias was used.
+
+- `Knowledge Base` is a collection of entities, allowing us to query for ID, canonical name, aliases and other information that might be relevant for the task at hand. One example is [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page).
+
+The first step to solve the counting problem is to normalize the names a document uses to refer to a named entity. Using our abstractions, this means we want to find all the mentions in a document and, for each one, use its alias to find the named entity it belongs to. Then we replace the mention with either the canonical alias or the named entity ID; this distinction will become clearer later on.
+
+### Entity Normalization
+
+Given a set of aliases that appear in a document, we developed heuristics (e.g. common tokens, initials) to identify which subset of aliases refer to the same named entity. This allowed us to limit our search space when comparing aliases.
+
+Using our previous example to illustrate this method, we start by assuming the canonical alias is the longest alias in a text for a given entity, and attempt to merge aliases together by evaluating which aliases match the heuristics we developed.
+
+
+
+ Table 1: Top 5 occurring aliases in the first few paragraphs of John Stuart Mill’s Wikipedia page, some referring to the same person.
+
+
+
+Comparing aliases with each other using exact token matching as a heuristic would solve this:
+
+
+
+ Table 2: Pairwise alias comparisons and resulting merges. Matches highlighted in bold.
+
+
+
+By replacing all mentions with their corresponding canonical alias, we are able to find the correct named entity counts.
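+
+As a rough sketch of the token-overlap merge illustrated in Table 2, the snippet below maps each alias to an assumed canonical alias (the longest alias it shares a token with) and merges the counts. The matching rule and the example counts are simplified assumptions; the production heuristics also handle initials and other edge cases.
+
+```python
+from collections import Counter
+
+def normalize_aliases(alias_counts):
+    """Map each alias to an assumed canonical alias and merge the counts."""
+    aliases = sorted(alias_counts, key=len, reverse=True)  # longest alias first
+    canonical = {}
+    for alias in aliases:
+        tokens = set(alias.split())
+        for existing in list(canonical.values()):
+            if tokens & set(existing.split()):   # any shared token -> same entity
+                canonical[alias] = existing
+                break
+        else:
+            canonical[alias] = alias             # becomes its own canonical alias
+
+    merged = Counter()
+    for alias, count in alias_counts.items():
+        merged[canonical[alias]] += count
+    return merged
+
+counts = {"Mill": 5, "John Stuart Mill": 2, "Stuart Mill": 1, "Jeremy Bentham": 1}
+print(normalize_aliases(counts))
+# Counter({'John Stuart Mill': 8, 'Jeremy Bentham': 1})
+```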
+
+One edge case is when an alias might refer to more than one entity: e.g. the alias “Potter” could refer to the named entities “Harry Potter” or “James Potter” within the Harry Potter universe. To solve this, we built an Entity Linker, which determines which named entity is the most likely to match the alias given the context. This process is further explained in the Linking to a Knowledge Base section.
+
+When an entity is not present in a knowledge base, we cannot use Named Entity Linking to disambiguate. In this case, our solution uses a fallback method that assigns the ambiguous mention (Potter) to the closest occurring unambiguous mention that matches the heuristics (e.g. Harry).
+
+## Linking to a Knowledge Base
+
+Given that many keyphrases and entities mentioned in a document are notable, they are likely present in a knowledge base. This allows us to leverage extra information present in the knowledge base to improve the normalization step as well as downstream tasks.
+
+Entity Linking assists normalization by providing information that an alias matches a named entity, which otherwise wouldn't match a heuristic (e.g. “Honest Abe” versus “Abraham Lincoln”). Furthermore, [information in a knowledge base can be used to embed linked entities and keyphrases in the same space as text](https://arxiv.org/abs/1601.01343).
+
+Being able to embed entities in the same space as text is useful, as this unlocks the ability to [compare possible matching named entity IDs with the context in which they’re mentioned](https://arxiv.org/abs/1911.03814), and make a decision on whether an alias we’re considering might be one of the entities in the knowledge base (in which case we will use IDs), or whether the alias doesn't match any entity in the knowledge base, in which case we fall back to using the assumed canonical alias.
+
+At Scribd we make use of Entity Linking to not only improve the Entity Normalization step, but also to take advantage of entity and keyphrase embeddings as supplemental features.
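+
+The disambiguation decision itself can be sketched as a similarity comparison. In the hedged example below, `context_vec` and the candidate embeddings stand in for the precomputed context and knowledge-base embeddings referenced above; only the cosine comparison and the fallback to the assumed canonical alias are what the snippet is meant to illustrate.
+
+```python
+import numpy as np
+
+def cosine(a, b):
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+def link_alias(context_vec, candidates, fallback_alias, threshold=0.4):
+    """candidates: dict mapping knowledge-base entity ID -> embedding vector."""
+    if not candidates:
+        return fallback_alias
+    best_id, best_score = max(
+        ((entity_id, cosine(context_vec, vec)) for entity_id, vec in candidates.items()),
+        key=lambda pair: pair[1],
+    )
+    # Use the knowledge-base ID when the match is confident enough, otherwise
+    # fall back to the assumed canonical alias.
+    return best_id if best_score >= threshold else fallback_alias
+```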
+
+## Discussion
+
+Putting all of this together, we can:
+
+1. Link documents to keyphrases and entities
+
+1. Find the relative importance of each in a document
+
+1. Take advantage of relevant information in knowledge bases
+
+This has enabled some interesting projects:
+
+In one of them, the Applied Research team built a graph of documents along with their related keyphrases and entities. Embedding documents, keyphrases and entities in the same space allowed us to discover documents by analogy. For example, take `The Count of Monte Cristo` by Alexandre Dumas, a 19th century French novel about revenge. If we add to its embedding the embedding of `science_fiction`, it leads us to a collection of science fiction novels by Jules Verne (another 19th century French author), such as `20,000 Leagues Under the Sea` and `Journey to the Center of the Earth`.
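+
+As a small, hedged illustration of the analogy query, the snippet below assumes the graph embeddings have been exported to a gensim `KeyedVectors` file; the key names and path are hypothetical.
+
+```python
+from gensim.models import KeyedVectors
+
+vectors = KeyedVectors.load("graph_embeddings.kv")  # hypothetical export of the graph embeddings
+
+# "document + keyphrase" analogy: Monte Cristo + science_fiction ~ Jules Verne novels
+similar = vectors.most_similar(
+    positive=["doc:the_count_of_monte_cristo", "keyphrase:science_fiction"],
+    topn=5,
+)
+print(similar)
+```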
+
+Keyphrase extraction has also been useful in adding clarity to document clusters. By extracting the most common keyphrases of a cluster, we can derive a common theme for the cluster’s content:
+
+
+
+
+ Figure 3: Top keyphrases in a document cluster. The keywords imply that the documents therein are related to dentistry & healthcare, which was confirmed by manually inspecting the documents.
+
+
+In yet another project, the team leveraged precomputed knowledge base embeddings to represent a document in space through a composition of the entities and keyphrases it contains. These features allowed us to understand the documents uploaded by our users and improve the content discovery on the platform.
+
+To see how we use the information extracted to classify documents into a
+taxonomy, make sure to check out [part 3](/blog/2021/categorizing-user-uploaded-documents.html).
+
+If you're interested to learn more about the problems Applied Research
+is solving, or the systems which are built around those solutions,
+check out [our open positions!](/careers/#open-positions)
+
diff --git a/_posts/2021-07-28-categorizing-user-uploaded-documents.md b/_posts/2021-07-28-categorizing-user-uploaded-documents.md
new file mode 100644
index 0000000..ddd2349
--- /dev/null
+++ b/_posts/2021-07-28-categorizing-user-uploaded-documents.md
@@ -0,0 +1,140 @@
+---
+layout: post
+title: "Categorizing user-uploaded documents"
+tags:
+- machinelearning
+- data
+- featured
+- kyc-series
+team: Applied Research
+author: moniquec
+---
+
+Scribd offers a variety of publisher and user-uploaded content to our users and
+while the publisher content is rich in metadata, user-uploaded content
+typically is not. Documents uploaded by the users have varied subjects and
+content types which can make it challenging to link them together. One way to
+connect content can be through a taxonomy - an important type of structured
+information widely used in various domains. In this series, we have already
+shared how we [identify document
+types](/blog/2021/identifying-document-types.html) and [extract information
+from documents](/blog/2021/information-extraction-at-scribd.html), this post
+will discuss how insights from data were used to help build the taxonomy and
+our approach to assign categories to the user-uploaded documents.
+
+
+## Building the taxonomy
+
+The unified taxonomy is a tree-structure with two layers that was designed by combining our Subject Matter Experts' (SME) knowledge of the book industry subject headings ([BISAC](https://bisg.org/page/BISACEdition) categories) and data-driven insights. We used user-reading patterns to find topics that could help enrich our unified taxonomy.
+
+### Data-Driven Insights
+
+Users have been interacting with Scribd content for more than 10 years, building reading patterns over time. We leveraged these reading patterns to create dense vector representations of documents, similar to the way word2vec creates them for words in text.
+
+
+
+ Figure 1: Schematic representation of our approach: reading sequences are used to create vector representations for user uploaded documents. The vector dimension shown is merely illustrative.
+
+
+For this work we focused only on user-uploaded documents and on one type of interaction (reading for a minimum amount of time). The embedding dimension (and other hyperparameters) was chosen to optimize the hit-ratio@20 ([Caselles-Dupré, et al 2018](https://arxiv.org/abs/1804.04212)), increasing how semantically tight the embeddings are.
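+
+As a minimal sketch of this step, the snippet below trains item2vec-style document embeddings from reading sequences with gensim's word2vec implementation; the corpus format and hyperparameter values are illustrative, not the tuned production settings.
+
+```python
+from gensim.models import Word2Vec
+
+# Each "sentence" is one user's reading sequence of document IDs.
+reading_sequences = [
+    ["doc_101", "doc_57", "doc_9"],
+    ["doc_57", "doc_432", "doc_101"],
+    # ...
+]
+
+model = Word2Vec(
+    sentences=reading_sequences,
+    vector_size=64,   # embedding dimension (illustrative)
+    window=5,         # context window over the reading sequence
+    min_count=2,      # ignore rarely-read documents
+    sg=1,             # skip-gram, commonly used for item2vec-style training
+)
+doc_vector = model.wv["doc_101"]
+```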
+
+Now that we have the embeddings we would like to use them to find groups of documents with similar subjects and topics. Finding these groups will help us identify categories that should be added to the taxonomy.
+
+Dimensionality reduction allows dense clusters of documents to be found more efficiently and accurately in the reduced space than in the original high-dimensional space of our embeddings. We reduced the dimension of the embeddings using the [t-SNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html) algorithm. t-SNE takes a non-linear approach that can capture both the local relationships between points and the global structure of the data. We used an implementation of t-SNE (“Fast Fourier Transform accelerated Interpolation-based t-SNE”, or [FIt-SNE](https://github.com/KlugerLab/FIt-SNE)) that is flexible and does not sacrifice accuracy for speed.
+
+Finally, we grouped the user-uploaded docs by clustering the reduced embeddings using [HDBSCAN](https://arxiv.org/pdf/1709.04545.pdf). HDBSCAN separates data points into clusters based on the density distribution. It also detects noise: points that are too far from the nearest detected cluster to belong to it and lack the density to form their own cluster.
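+
+A rough sketch of the reduce-and-cluster step is shown below, using scikit-learn's t-SNE as a stand-in for FIt-SNE together with the hdbscan package; the placeholder data and parameter values are illustrative only.
+
+```python
+import numpy as np
+import hdbscan
+from sklearn.manifold import TSNE
+
+embeddings = np.random.rand(1000, 64)  # placeholder for the document embeddings
+
+# Reduce to 2D so that dense groups become easier to find (and to plot).
+reduced = TSNE(n_components=2, perplexity=30, init="pca").fit_transform(embeddings)
+
+# Cluster the reduced embeddings; label -1 marks points treated as noise.
+labels = hdbscan.HDBSCAN(min_cluster_size=20).fit_predict(reduced)
+```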
+
+Figure 2 shows the 2D representation of the user-uploaded documents and their groups. The first thing we noticed, highlighted in this figure, is that the major groups are largely organized by language. Not surprisingly, users tend to read content mostly in a single language.
+
+
+
+ Figure 2: Initial 2D representation of the embeddings using t-SNE and HDBSCAN. Each colored group represents a cluster found by HDBSCAN. Spread grey points were identified as noise.
+
+
+We developed a technique to further split the groups above into smaller clusters that are semantically tighter. The final clusters can be seen in Figure 3.
+
+
+
+ Figure 3: Final 2D representation of the embeddings after further splitting of each cluster. Each colored group represents a subcluster found by HDBSCAN for a particular cluster. Spread grey points were identified as noise.
+
+
+After we got the clusters and subclusters shown in Figure 3, we inspected the English subclusters to identify their major subjects and themes. This investigation led to the incorporation of additional categories into the taxonomy, such as Philippine law, Study aids & test prep, and Teaching methods & materials, making the taxonomy broader across different content types and browsing this content more straightforward.
+
+## Placing documents into categories
+
+
+
+ Figure 4: Diagram of Scribd’s multi-component pipeline. Categorization is one of the downstream tasks highlighted in the diagram.
+
+
+Now that we have the taxonomy, it is time to place the documents into categories. Our approach leverages the extracted key phrases and entities discussed in [part II](/blog/2021/information-extraction-at-scribd.html) of the series. Figure 5 illustrates how our model works: we trained a supervised model to place documents identified as text-heavy (see [part I](/blog/2021/identifying-document-types.html)) into categories using key phrases, entities and the text.
+
+
+
+ Figure 5: Model architecture to categorize docs.
+
+
+### Additional insights from data
+
+In the first iteration of the model, we had a training dataset collected by our experts to fit the definition of each category. Not surprisingly, upon testing the model on unseen data in production, we realized that for some categories the training set was not a complete representation of the type of documents in production that could fit them, so the model was unable to generalize from the initial training set. As an example, in the initial training set most documents about countries other than the US were documents about travel, which taught the model that whenever a document mentions other countries, the document is most likely about travel. As a result, documents about business in South America, for instance, would be placed under travel by the model.
+
+We applied a technique sometimes referred to as active learning to supplement our training set with the missing examples. Following this technique (Figure 6), the model is applied to a random sample of documents and the results analyzed by our SMEs.
+
+
+
+ Figure 6: Active Learning Process used to improve model performance.
+
+
+This iterative process had two outcomes: it improved category performance by re-training the model with a larger variety of training examples, and it led to the addition of a new category after we identified that a good fraction of documents fit it.
+
+## Additional Experiments
+
+Throughout this project several experiments were performed to explore the full potential of the user interaction clusters. Here we will show one exciting example of such an experiment.
+
+#### Giving names to clusters
+
+As explained above, each subcluster shown in Figure 3 is, in general, semantically tight, which means that the documents belonging to it are usually about one (or a few) topics or subjects.
+
+One way to associate topics with the subclusters would be to have Subject Matter Experts manually inspect the documents in each subcluster and come up with the most important topics for each of them. However, this approach is time consuming and does not scale with new iterations of the model and a likely increasing number of clusters, so it is important to make this process more automatic and flexible.
+
+We experimented with a very promising two-step approach to automatically assign topics to subclusters. In this approach, we leverage the extracted information from the text described in [part II](/blog/2021/information-extraction-at-scribd.html) and zero-shot topic classification (more info [here](https://arxiv.org/abs/1909.00161)):
+
+Step 1 - Find the subclusters' most representative key phrases by clustering their documents' extracted info.
+
+
+
+ Figure 7: Illustration of Step 1.
+
+
+Step 2 - Use the result of step 1 and zero-shot topic classification to find the highest ranking topics for each subcluster.
+
+
+
+ Figure 8: Illustration of Step 2. The bar plot with the highest ranking topics is the result of this approach for a subcluster that contains essays about several literary works.
+
+
+As can be seen in Figure 8, a cluster composed of essays about literary works has literary criticism as its highest ranking topic, showing the potential of this approach for automatically giving names to user interaction clusters.
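+
+A hedged sketch of step 2 is shown below, using the Hugging Face zero-shot classification pipeline (which builds on the NLI-based approach cited above); the candidate topics and the keyphrase input are illustrative.
+
+```python
+from transformers import pipeline
+
+classifier = pipeline("zero-shot-classification")
+
+# Representative keyphrases for one subcluster (the output of step 1), joined as text.
+cluster_keyphrases = "literary analysis, novel, character development, symbolism"
+candidate_topics = ["literary criticism", "travel", "dentistry", "business"]
+
+result = classifier(cluster_keyphrases, candidate_labels=candidate_topics)
+print(result["labels"][0])  # highest ranking topic for the subcluster
+```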
+
+## Conclusion
+
+Two important takeaways from this journey of categorizing documents were:
+
+**High quality labeled data** - We found that clean and consistently labeled data was much more important to the model than hyperparameter tuning. However, getting enough documents that fit the categories in our diverse corpus was a challenge. Several techniques were used to improve model performance on unseen data. Among them, active learning proved to be an important way to collect additional training samples and to guarantee the required granularity in the training set.
+
+**Annotation alignment** - High quality data and model performance are both connected to the annotation process (see more [here](https://www.youtube.com/watch?v=06-AZXmwHjo)). When multiple annotators are involved in the data collection and evaluation, alignment on the definition of each category is crucial for an accurate training and evaluation of the model. This is even more essential in text classification, since associating categories/topics to a text can be a very subjective task, especially when we are dealing with a single-label categorization problem.
+
+This project was an important milestone in understanding our user-uploaded documents: Classifying documents has enabled users to browse documents by category from our unified taxonomy. Additionally, we now have the power of understanding the categories that each user is interested in and interacts with. Combining the user interests with business metrics could help drive innovative and unexpected product decisions as well as enrich discoverability and recommendations.
+
+## Next Steps
+
+**Improve taxonomy using a data driven approach:**
+
+Moving forward, how can we make sure that newly uploaded documents are covered in our taxonomy?
+
+Using a data-driven approach to build the taxonomy answers this question and guarantees more flexibility, comprehensiveness, and specificity than a manually created taxonomy. As new content is uploaded to our platform and read by users, new user interaction clusters will form and help us identify recent user interests. For instance, during the pandemic, users started uploading documents related to Covid-19. Clustering the documents in 2021, for example, yields an additional cluster related to Covid-19, one that did not exist prior to the pandemic. This approach will help us build a less rigid taxonomy, a taxonomy that reflects Scribd’s vast content and is easily expandable in the long run.
+
+**Multi-language:**
+
+Now that we better understand our English user-uploaded content and have a consistent pipeline to label these documents, we can extend this approach to other languages.
+
+This work and post were done in collaboration with my colleague [Antonia Mouawad](https://ca.linkedin.com/in/antoniamouawad) on the Applied Research team. If you're interested to learn more about the problems Applied Research is solving, or the systems which are built around those solutions, check out [our open positions](/careers/#open-positions).
diff --git a/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md
new file mode 100644
index 0000000..5f17cad
--- /dev/null
+++ b/_posts/2021-09-29-android-audio-player-tutorial-with-armadillo.md
@@ -0,0 +1,92 @@
+---
+layout: post
+title: "Armadillo makes audio players in Android easy"
+tags:
+- android
+- kotlin
+- armadillo
+- featured
+author: nathans
+team: Android
+---
+
+Armadillo is the fully featured audio player library Scribd uses to play and
+download all of its audiobooks and podcasts, which is [now open
+source](https://github.com/scribd/armadillo). It specializes in playing HLS
+or MP3 content that is broken down into chapters or tracks. It leverages
+[Google’s Exoplayer](https://github.com/google/ExoPlayer/) library for its audio engine. Exoplayer wraps a variety of
+low-level audio and video APIs but has few opinions of its own for actually
+using audio in an Android app.
+
+
+
+The leap required from Exoplayer to a full audio player
+is enormous both in terms of the amount of code needed and the amount of
+domain knowledge required about complex audio-related subjects. Armadillo
+provides a turn-key solution for powering an audio player and providing the
+information to update a UI.
+
+- **Easy-to-use** because it outputs state updates with everything needed for a UI or analytics. Works in the background state.
+- **Effective** because it uses Google’s Exoplayer as the playback engine.
+- **Ready-to-go** out of the box usage for a developer looking to use an audio player.
+- **Robust** because it contains numerous configuration options for supporting almost any requirement and includes a number of other Android APIs
+required for a high quality audio player.
+
+## What does it include?
+
+- Support for HLS and MP3 audio
+- Exoplayer for downloading and playback
+- [MediaBrowserService](https://developer.android.com/reference/android/service/media/MediaBrowserService) so the app can be played in the background, browsed by other apps, and integrated with Android Auto.
+- [MediaSession](https://developer.android.com/reference/android/media/session/MediaSession) to support commands from media controllers, e.g. a Bluetooth headset.
+
+## Getting Started:
+
+The library is hosted on GitHub Packages, so you will need to add the GitHub registry with authentication to your `build.gradle` file. See the official docs on authenticating [here](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-gradle-registry#authenticating-to-github-packages); in short, you will need to:
+
+1. Generate a [personal access token](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) from your Github account.
+1. Add the Github package registry with authentication to your `build.gradle` file.
+
+```kotlin
+maven {
+ name = "GitHubPackages"
+ url = uri("https://maven.pkg.github.com/scribd/armadillo-and")
+ credentials {
+ username = "github_username"
+ password = "github_access_token"
+ }
+}
+```
+
+It is as easy as adding this code snippet to your Activity / Fragment to play your first piece of content.
+
+```kotlin
+// construct your media
+val media = AudioPlayable(
+ id = 0,
+ title = "Google Hosted Mp3",
+ request = AudioPlayable.MediaRequest.createHttpUri("https://storage.googleapis.com/exoplayer-test-media-0/play.mp3"),
+ chapters = emptyList()
+)
+
+// initialize the player
+val armadilloPlayer = ArmadilloPlayerFactory.init()
+
+// begin playback
+armadilloPlayer.beginPlayback(media)
+
+// listen for state updates
+armadilloPlayer.armadilloStateObservable.subscribe {
+
+ // update your UI here
+
+}
+```
+
+That’s all you need to get started!
+
+## Next Steps:
+
+For a more complex example, please see the [TestApp](https://github.com/scribd/armadillo/tree/main/TestApp) included in the library. If
+you have any problems, don’t be afraid to open up an issue [on
+GitHub](https://github.com/scribd/armadillo).
+
diff --git a/_posts/2022-04-28-data-ai-summit-2022.md b/_posts/2022-04-28-data-ai-summit-2022.md
new file mode 100644
index 0000000..8916901
--- /dev/null
+++ b/_posts/2022-04-28-data-ai-summit-2022.md
@@ -0,0 +1,45 @@
+---
+layout: post
+title: "Scribd is presenting at Data and AI Summit 2022"
+team: Core Platform
+author: rtyler
+tags:
+- databricks
+- kafka
+- deltalake
+- featured
+---
+
+We are very excited to be presenting and attending this year's [Data and AI
+Summit](https://databricks.com/dataaisummit/north-america-2022) which will be
+hosted virtually and physically in San Francisco from June 27th-30th.
+Throughout the course of 2021 we completed a number of really interesting
+projects built around [delta-rs](https://github.com/delta-io/delta-rs) and the
+Databricks platform which we are thrilled to share with a broader audience.
+In addition to the presentations listed below, a number of Scribd engineers who
+are responsible for data and ML platform, machine learning systems, and more,
+will be in attendance if you want to meet up and learn more about how Scribd
+uses data and ML to change the way the world reads!
+
+
+* [Christian Williams](https://github.com/xianwill) will be sharing some of the
+work he has done developing
+[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk:
+**[Streaming Data into Delta Lake with Rust and
+Kafka](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1834)**
+* [QP Hou](https://github.com/houqp), Scribd Emeritus, will be presenting on
+his foundational work to ensure correctness within delta-rs during his session:
+**[Ensuring Correct Distributed Writes to Delta Lake in Rust with Formal
+Verification](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=1623)**
+* [R Tyler Croy](https://github.com/rtyler) will be co-presenting with Gavin
+Edgley from Databricks on the cost analysis work Scribd has done to efficiently
+grow our data platform with **[Doubling the size of the data lake without doubling the
+cost](https://databricks.com/dataaisummit/north-america-2022/agenda/?sessionid=2366)**
+
+
+There are so many great sessions to watch in person or online during the event,
+particularly around [Delta Lake](https://delta.io), which is one of our
+favorite technologies and powers our entire data platform. We are also
+expecting some great ML related talks as data and ML begin to overlap more and
+more. We hope to see you there!
+
diff --git a/_posts/2022-06-28-databricks-serverless.md b/_posts/2022-06-28-databricks-serverless.md
new file mode 100644
index 0000000..1007238
--- /dev/null
+++ b/_posts/2022-06-28-databricks-serverless.md
@@ -0,0 +1,58 @@
+---
+layout: post
+title: "Accelerating Looker with Databricks SQL Serverless"
+tags:
+- looker
+- databricks
+- featured
+team: Core Platform
+author: hamiltonh
+---
+
+We recently migrated Looker to Databricks SQL Serverless, improving our
+infrastructure cost and reducing the footprint of infrastructure we need to
+worry about! “Databricks SQL” provides a single load-balanced Warehouse
+for executing Spark SQL queries across multiple Spark clusters behind the
+scenes. “Serverless” is an evolution of that concept: rather than running a SQL
+Warehouse in our AWS infrastructure, the entirety of execution happens on the
+Databricks side. With a much simpler and faster interface, queries executed in
+Looker now return results to our users faster than ever before!
+
+When we originally provisioned our “Databricks SQL” warehouses, we worked
+together with our colleagues at Databricks to ensure [the terraform provider
+for Databricks](https://github.com/databricks/terraform-provider-databricks) was
+ready for production usage; as of today it is Generally Available. That
+original foundation in Terraform allowed us to more easily adopt SQL Serverless
+once it was made available to us.
+
+```hcl
+resource "databricks_sql_warehouse" "warehouse" {
+ name = "Looker Serverless"
+ # ...
+ enable_serverless_compute = true
+ # ...
+}
+```
+
+The feature was brand new, so there were a few integration hurdles we
+had to work through with our colleagues at Databricks, but we got things up and
+running in short order. By adopting SQL Serverless, we could avoid setting up
+special networking, IAM roles, and other resources within our own AWS account;
+we instead rely on pre-provisioned compute resources within Databricks' own
+infrastructure. No more headache of ensuring all of the required infra is in
+place and set up correctly!
+
+The switch to Serverless reduced our infra configuration and management
+footprint, which by itself is an improvement. We also noticed a significant
+reduction in cold start times for the SQL Serverless Warehouse compared to the
+standard SQL Warehouse. The faster start-up times meant we could configure even
+lower auto-terminate times on the warehouse, saving us even more on
+unproductive and idle cluster costs.
+
+On the Looker side there really wasn’t any difference in the connection
+configuration other than a URL change. In the end, after some preparation work,
+a simple 5-minute change in Looker and another 5-minute change in Terraform
+switched everything over to Databricks SQL Serverless, and we were ready to
+rock! Our BI team is very happy with the performance, especially on cold start
+queries. Our CFO is happy about reducing infrastructure costs. And I’m happy
+about simpler infrastructure!
diff --git a/_posts/2022-07-21-data-ai-summit-videos.md b/_posts/2022-07-21-data-ai-summit-videos.md
new file mode 100644
index 0000000..828f149
--- /dev/null
+++ b/_posts/2022-07-21-data-ai-summit-videos.md
@@ -0,0 +1,45 @@
+---
+layout: post
+title: "Data and AI Summit Wrap-up"
+team: Core Platform
+author: rtyler
+tags:
+- databricks
+- kafka
+- deltalake
+- featured
+---
+
+We brought a whole team to San Francisco to present and attend this year's Data and
+AI Summit, and it was a blast!
+I would consider the event a success, both in the attendance at the Scribd-hosted
+talks and in the number of talks which discussed patterns we have adopted in our
+own data and ML platform.
+The three talks I [wrote about
+previously](/blog/2022/data-ai-summit-2022.html) were well received and have
+since been posted to YouTube along with _hundreds_ of other talks.
+
+* [Christian Williams](https://github.com/xianwill) shared some of the
+work he has done developing
+[kafka-delta-ingest](https://github.com/scribd/kafka-delta-ingest) in his talk:
+[](https://www.youtube.com/watch?v=do4jsxeKfd4&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=195)
+* [QP Hou](https://github.com/houqp), Scribd Emeritus, presented on
+his foundational work to ensure correctness within delta-rs during his session:
+[](https://www.youtube.com/watch?v=ABoCnrVWCKY&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=112)
+* [R Tyler Croy](https://github.com/rtyler) co-presented with Gavin
+Edgley from Databricks on the cost analysis work Scribd has done to efficiently
+grow our data platform with:
+[](https://www.youtube.com/watch?v=9QDRD0PzqCE&list=PLTPXxbhUt-YVWi_cf2UUDc9VZFLoRgu0l&index=122)
+
+Members of the Scribd team participated in a panel to discuss the past,
+present, and future of Delta Lake on the expo floor. We also took advantage of
+the time to have multiple discussions with our colleagues at Databricks about
+their product and engineering roadmap, and where we can work together to
+improve the future of Delta Lake, Unity catalog, and more.
+
+For those working in the data, ML, or infrastructure space, there are a lot of
+_great_ talks available online from the event, which I highly recommend
+checking out. Data and AI Summit is a great event for leaders in the industry
+to get together, so we'll definitely be back next year!
diff --git a/_posts/2024-02-05-evolution-of-mlplatform.md b/_posts/2024-02-05-evolution-of-mlplatform.md
new file mode 100644
index 0000000..37f22c2
--- /dev/null
+++ b/_posts/2024-02-05-evolution-of-mlplatform.md
@@ -0,0 +1,133 @@
+---
+layout: post
+title: "The Evolution of the Machine Learning Platform"
+team: Machine Learning Platform
+author: bshaw
+tags:
+- mlops
+- featured
+- ml-platform-series
+---
+
+Machine Learning Platforms (ML Platforms) have the potential to be a key component in achieving production ML at scale without large technical debt, yet ML Platforms are often not well understood. This post outlines the key concepts and paradigm shifts that led to the conceptualization of ML Platforms, in an effort to increase understanding of these platforms and how they can best be applied.
+
+
+Technical Debt and development velocity defined
+-----------------------------------------------
+
+### Development Velocity
+
+Machine learning development velocity refers to the speed and efficiency at which machine learning (ML) projects progress from the initial concept to deployment in a production environment. It encompasses the entire lifecycle of a machine learning project: data collection and preprocessing, then model training, evaluation, validation, deployment and testing for new models, or re-training, validation and deployment for existing models.
+
+### Technical Debt
+
+The term "technical debt" in software engineering was coined by Ward Cunningham. Cunningham used the metaphor of financial debt to describe the trade-off between implementing a quick and dirty solution to meet immediate needs (similar to taking on financial debt for short-term gain) versus taking the time to do it properly with a more sustainable and maintainable solution (akin to avoiding financial debt but requiring more upfront investment). Just as financial debt accumulates interest over time, technical debt can accumulate and make future development more difficult and expensive.
+
+The idea behind technical debt is to highlight the consequences of prioritizing short-term gains over long-term maintainability and the need to address and pay off this "debt" through proper refactoring and improvements. The term has since become widely adopted in the software development community to describe the accrued cost of deferred work on a software project.
+
+### Technical Debt in Machine Learning
+
+Originally a software engineering concept, technical debt is also relevant to machine learning systems. In fact, the landmark Google paper suggests that ML systems have a propensity to accumulate technical debt easily.
+
+> Machine learning offers a fantastically powerful toolkit for building useful complex prediction systems quickly. This paper argues it is dangerous to think of these quick wins as coming for free. Using the software engineering framework of technical debt, we find it is common to incur massive ongoing maintenance costs in real-world ML systems
+>
+> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems)
+
+> As the machine learning (ML) community continues to accumulate years of experience with live systems, a wide-spread and uncomfortable trend has emerged: developing and deploying ML systems is relatively fast and cheap, but maintaining them over time is difficult and expensive
+>
+> [Sculley et al (2021) Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems)
+
+Technical debt is especially important to consider when trying to move fast. Moving fast is easy; moving fast without acquiring technical debt is a lot more complicated.
+
+The Evolution Of ML Platforms
+-----------------------------
+
+### DevOps -- The paradigm shift that led the way
+
+DevOps is a methodology in software development which advocates for teams owning the entire software development lifecycle. This paradigm shift from fragmented teams to end-to-end ownership enhances collaboration and accelerates delivery. DevOps has become standard practice in modern software development, with many organizations considering it an essential part of their software development and delivery processes. Some of the principles of DevOps are:
+
+1. **Automation**
+
+2. **Continuous Testing**
+
+3. **Continuous Monitoring**
+
+4. **Collaboration and Communication**
+
+5. **Version Control**
+
+6. **Feedback Loops**
+
+
+### Platforms -- Reducing Cognitive Load
+
+This shift to DevOps and teams owning the entire development lifecycle introduces a new challenge: additional cognitive load. Cognitive load can be defined as
+
+> The total amount of mental effort a team uses to understand, operate and maintain their designated systems or tasks.
+>
+> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book)
+
+The additional load introduced by DevOps, with teams owning the entire software development lifecycle, can hinder productivity, prompting organizations to seek solutions.
+
+Platforms emerged as a strategic solution, carefully abstracting away unnecessary details of the development lifecycle. This abstraction allows engineers to focus on critical tasks, mitigating cognitive load and fostering a more streamlined workflow.
+
+> The purpose of a platform team is to enable stream-aligned teams to deliver work with substantial autonomy. The stream-aligned team maintains full ownership of building, running, and fixing their application in production. The platform team provides internal services to reduce the cognitive load that would be required from stream-aligned teams to develop these underlying services.
+>
+> [Skelton & Pais (2019) Team Topologies](https://teamtopologies.com/book)
+
+> Infrastructure Platform teams enable organisations to scale delivery by solving common product and non-functional requirements with resilient solutions. This allows other teams to focus on building their own things and releasing value for their users
+>
+> [Rowse & Shepherd (2022) Building Infrastructure Platforms](https://martinfowler.com/articles/building-infrastructure-platform.html)
+
+### ML Ops -- Reducing technical debt of machine learning
+
+The ability of ML systems to rapidly accumulate technical debt has given rise to the concept of MLOps. MLOps is a methodology that takes inspiration from and incorporates the best practices of DevOps, tailoring them to address the distinctive challenges inherent in machine learning. It applies the established principles of DevOps to machine learning, recognizing that only a fraction of a real-world ML system comprises the actual ML code, and serves as a crucial bridge between development and the ongoing work of maintaining ML systems.
+MLOps provides a collection of concepts and workflows designed to promote efficiency, collaboration, and sustainability across the ML lifecycle. Correctly applied, MLOps can play a pivotal role in controlling technical debt and ensuring the efficiency, reliability, and scalability of the machine learning lifecycle over time.
+
+Scribd's ML Platform -- MLOps and Platforms in Action
+-------------------------------------
+At Scribd we have developed a machine learning platform which provides a curated developer experience for machine learning developers. This platform has been built with MLOps in mind, which can be seen in its use of common DevOps principles.
+
+1. **Automation:**
+ * Applying CI/CD strategies to model deployments through the use of Jenkins pipelines which deploy models from the Model Registry to AWS-based endpoints.
+ * Automating model training through the use of Airflow DAGs and allowing these DAGs to trigger the deployment pipelines to deploy a model once re-training has occurred (a minimal sketch of this pattern follows the list below).
+
+2. **Continuous** **Testing:**
+ * Applying continuous testing as part of a model deployment pipeline, removing the need for manual testing.
+ * Increased tooling to support model validation testing.
+
+3. **Monitoring:**
+ * Monitoring real-time inference endpoints
+ * Monitoring training DAGs
+ * Monitoring batch jobs
+
+4. **Collaboration and Communication:**
+ * Feature Store which provides feature discovery and re-use
+ * Model Database which provides model collaboration
+
+5. **Version Control:**
+ * Applying version control to experiments, machine learning models and features
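+
+As referenced in the Automation item above, here is a minimal, hedged sketch of a re-training DAG that triggers a downstream deployment pipeline. The DAG IDs, task names, schedule, and `retrain_model` function are hypothetical placeholders, not the actual pipelines running on Scribd's platform.
+
+```python
+from datetime import datetime
+
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+from airflow.operators.trigger_dagrun import TriggerDagRunOperator
+
+
+def retrain_model():
+    ...  # train a new model version and register it in the model registry
+
+
+with DAG(
+    dag_id="model_retraining",          # hypothetical DAG ID
+    start_date=datetime(2024, 1, 1),
+    schedule="@weekly",
+    catchup=False,
+) as dag:
+    retrain = PythonOperator(task_id="retrain", python_callable=retrain_model)
+
+    trigger_deploy = TriggerDagRunOperator(
+        task_id="trigger_deployment",
+        trigger_dag_id="model_deployment",  # hypothetical deployment pipeline DAG
+    )
+
+    # Once re-training succeeds, kick off the deployment pipeline.
+    retrain >> trigger_deploy
+```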
+
+
+References
+----------
+
+[Bottcher (2018, March 5). What I Talk About When I Talk About Platforms. https://martinfowler.com/articles/talk-about-platforms.html](https://martinfowler.com/articles/talk-about-platforms.html)
+
+[D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-François Crespo, Dan Dennison (2021). Hidden Technical Debt in Machine Learning Systems](https://www.scribd.com/document/428241724/Hidden-Technical-Debt-in-Machine-Learning-Systems)
+
+[Fowler (2022, October 20). Conway's Law. https://martinfowler.com/bliki/ConwaysLaw.html](https://martinfowler.com/bliki/ConwaysLaw.html)
+
+[Galante. What is Platform Engineering? https://platformengineering.org/blog/what-is-platform-engineering](https://platformengineering.org/blog/what-is-platform-engineering)
+
+[Humanitec. State of Platform Engineering Report](https://www.scribd.com/document/611845499/Whitepaper-State-of-Platform-Engineering-Report)
+
+[Hodgson (2023, July 19). How platform teams get stuff done. https://martinfowler.com/articles/platform-teams-stuff-done.html](https://martinfowler.com/articles/platform-teams-stuff-done.html)
+
+[Murray (2017, April 27). The Art of Platform Thinking. https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking](https://www.thoughtworks.com/insights/blog/platforms/art-platform-thinking)
+
+[Rouse (2017, March 20). Technical Debt. https://www.techopedia.com/definition/27913/technical-debt](https://www.techopedia.com/definition/27913/technical-debt)
+
+[Rowse & Shepherd (2022). Building Infrastructure Platforms. https://martinfowler.com/articles/building-infrastructure-platform.html](https://martinfowler.com/articles/building-infrastructure-platform.html)
+
+[Skelton & Pais (2019). Team Topologies](https://teamtopologies.com/book)
diff --git a/_posts/2025-01-15-cloud-native-data-ingestion.md b/_posts/2025-01-15-cloud-native-data-ingestion.md
new file mode 100644
index 0000000..2df1b88
--- /dev/null
+++ b/_posts/2025-01-15-cloud-native-data-ingestion.md
@@ -0,0 +1,35 @@
+---
+layout: post
+title: "Cloud-native Data Ingestion with AWS Aurora and Delta Lake"
+team: "Core Infrastructure"
+author: rtyler
+tags:
+- deltalake
+- rust
+- featured
+---
+
+
+One of the major themes for Infrastructure Engineering over the past couple
+years has been higher reliability and better operational efficiency. In a
+recent session with the [Delta Lake](https://delta.io) project I was able to
+share the work led by Kuntal Basu and a number of other people to _dramatically_
+improve the efficiency and reliability of our online data ingestion pipeline.
+
+
+> Join Kuntal Basu, Staff Infrastructure Engineer, and R. Tyler Croy, Principal
+> Engineer at Scribd, Inc. as they take you behind the scenes of Scribd’s data
+> ingestion setup. They’ll break down the architecture, explain the tools, and
+> walk you through how they turned off-the-shelf solutions into a robust
+> pipeline.
+
+
+## Video
+
+
+
+
+## Presentation
+
+
+
diff --git a/_posts/2025-03-14-terraform-oxbow-module.md b/_posts/2025-03-14-terraform-oxbow-module.md
new file mode 100644
index 0000000..ab48af2
--- /dev/null
+++ b/_posts/2025-03-14-terraform-oxbow-module.md
@@ -0,0 +1,61 @@
+---
+layout: post
+title: "Terraform module to manage Oxbow Lambda and its components"
+tags:
+- Oxbow
+- Terraform
+- AWS
+- deltalake
+- rust
+team: Core Infrastructure
+author: Oleh Motrunych
+---
+
+
+[Oxbow](https://github.com/buoyant-data/oxbow) is a project that turns an existing storage location containing [Apache Parquet](https://parquet.apache.org/) files into a [Delta Lake table](https://delta.io/).
+It is intended to run both as an AWS Lambda or as a command line application.
+We are excited to introduce [terraform-oxbow](https://github.com/scribd/terraform-oxbow), an open-source Terraform module that simplifies the deployment and management of AWS Lambda and its supporting components. Whether you're working with AWS Glue, Kinesis Data Firehose, SQS, or DynamoDB, this module provides a streamlined approach to infrastructure as code (IaC) in AWS.
+
+### ✨ Why terraform-oxbow?
+Managing event-driven architectures in AWS can be complex, requiring careful orchestration of multiple services. terraform-oxbow abstracts much of this complexity, enabling users to configure key components with simple boolean flags and module parameters. This is an easy and efficient way to have a Delta table created from Apache Parquet files.
+### 🚀 Features
+
+With **terraform-oxbow**, you can deploy:
+
+- AWS Oxbow Lambda with customizable configurations
+- Kinesis Data Firehose for real-time data streaming
+- SQS and SQS Dead Letter Queues for event-driven messaging
+- IAM policies for secure access management
+- S3 bucket notifications to trigger Lambda functions
+- DynamoDB tables for data storage and locking
+- AWS Glue Catalog and Tables for schema management
+
+
+### ⚙️ How It Works
+
+This module follows a modular approach, allowing users to enable or disable services based on their specific use case. Here are a few examples:
+
+- To enable AWS Glue Catalog and Tables: `enable_aws_glue_catalog_table = true`
+
+- To enable Kinesis Data Firehose delivery stream: `enable_kinesis_firehose_delivery_stream = true`
+
+- To enable S3 bucket notifications: `enable_bucket_notification = true`
+
+- To enable advanced Oxbow Lambda setup for multi-table filtered optimization: `enable_group_events = true`
+
+- AWS S3 bucket notifications have limitations: due to AWS constraints, an S3 bucket can only have a single notification configuration. If you need to trigger multiple Lambda functions from the same S3 bucket, consider using event-driven solutions like SNS or SQS.
+
+
+- IAM Policy Management: The module provides the necessary permissions but follows the principle of least privilege. Ensure your IAM policies align with your security requirements.
+
+
+- Scalability and Optimization: The module allows fine-grained control over Lambda concurrency, event filtering, and data processing configurations to optimize costs and performance.
+
diff --git a/assets/_sass/component/_post-content.scss b/assets/_sass/component/_post-content.scss
index ff4a7bf..49466f3 100644
--- a/assets/_sass/component/_post-content.scss
+++ b/assets/_sass/component/_post-content.scss
@@ -4,17 +4,13 @@
// Bump of base font size for a comfortable reading experience
.post-content {
- font-size: 1rem;
+ // actually 17.875px
+ font-size: rem-calc(22px);
// make sure really long words and links wrap
word-wrap: break-word;
overflow-wrap: break-word;
- @media (min-width: $bp-sm) {
- font-size: rem-calc(18px);
- }
-
@media (min-width: $bp-lg) {
- font-size: rem-calc(20px);
line-height: 1.6;
}
}
diff --git a/assets/js/jobs.js b/assets/js/jobs.js
index 829719b..8172cb9 100644
--- a/assets/js/jobs.js
+++ b/assets/js/jobs.js
@@ -5,7 +5,7 @@
*
* With that disclaimer out of the way...
*
- * This file handles the fetching of jobs from Lever such that they can be
+ * This file handles the fetching of jobs from Lever^WAshby such that they can be
* dynamically inserted into different parts of the tech blog
*/
@@ -13,7 +13,7 @@
* This API will return an list of departments which must then be filtered
* through to find the .postings under each
*/
-const API_URL = 'https://api.lever.co/v0/postings/scribd?group=department&mode=json'
+const API_URL = 'https://api.ashbyhq.com/posting-api/job-board/scribd?includeCompensation=true'
/*
@@ -37,21 +37,20 @@ function fetchJobs() {
return fetch(API_URL)
.then(async (response) => {
- const departments = await response.json();
+ const board = await response.json();
/*
* Since this is the tech blog, we're only pulling a couple of
* departments
*/
- departments
- .filter(d => ['Engineering', 'Data Science', 'Design'].includes(d.title))
- .forEach((department) => {
- department.postings.forEach((posting) => {
- const team = posting.categories.team;
+ board.jobs
+ .filter(j => ['Engineering', 'Product, Design, & Analytics', 'Product'].includes(j.department))
+ .filter(j => !j.title.toLowerCase().includes('marketing'))
+ .forEach((job) => {
+ const team = job.team;
if (!window.jobsCache[team]) {
window.jobsCache[team] = [];
}
- window.jobsCache[team].push(posting);
- });
+ window.jobsCache[team].push(job);
});
window.jobsFetched = true;
return window.jobsCache;
@@ -98,9 +97,9 @@ function renderJobs(elem, team, randomLimit) {
li.innerHTML = `
`;
elem.appendChild(li);
diff --git a/careers.html b/careers.html
index e70f715..5a5072a 100644
--- a/careers.html
+++ b/careers.html
@@ -18,7 +18,7 @@
alt="two people sitting around a sofa reading on a computer and tablet">
-
Help us change the way the world reads.
+
Help us build the largest and most accessible library connecting storytellers with their audience.
Our readers are on a mission to become their best selves, and so are we. We’re not afraid to take risks because we know that — win or lose — we’ll learn from them.
If you’re a talented team player and want to work somewhere where your input matters, we’d love to talk with you.
@@ -60,7 +60,7 @@
We believe that the secret to making the perfect product is
-
Benifits
+
Benefits
Our team takes great care of us, in return, we take great care of them.